From 782dc78d094a7121dfb940f3976204661eda08e2 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Tue, 16 Jul 2024 11:31:35 -0700 Subject: [PATCH] edits --- langchain_benchmarks/model_registration.py | 22 +- .../tool_usage/tasks/extraction_query.py | 480 +++++++++++++----- scripts/extract_task.py | 150 ++++-- 3 files changed, 492 insertions(+), 160 deletions(-) diff --git a/langchain_benchmarks/model_registration.py b/langchain_benchmarks/model_registration.py index 1edcdb9..9450293 100644 --- a/langchain_benchmarks/model_registration.py +++ b/langchain_benchmarks/model_registration.py @@ -162,9 +162,27 @@ "model": "gpt-4-32k-0314", }, ), + RegisteredModel( + provider="openai", + name="gpt-4o", + description="GPT-4o base model", + type="chat", + params={ + "model": "gpt-4o", + }, + ), ] _FIREWORKS_MODELS = [ + RegisteredModel( + provider="fireworks", + name="firefunction-v2", + type="chat", + description="Fireworks function calling model", + params={ + "model": "accounts/fireworks/models/firefunction-v2", + }, + ), RegisteredModel( provider="fireworks", name="llama-v2-7b-chat-fw", @@ -207,7 +225,9 @@ description="Mistral MoE 8x7B Instruct v0.1 model with Sparse " "Mixture of Experts. Fine tuned for instruction following", type="llm", - params={"model": "accounts/fireworks/models/mixtral-8x7b-instruct"}, + params={ + "model": "accounts/fireworks/models/mixtral-8x7b-instruct" + }, ), ] diff --git a/langchain_benchmarks/tool_usage/tasks/extraction_query.py b/langchain_benchmarks/tool_usage/tasks/extraction_query.py index 286d47a..dbc0aad 100644 --- a/langchain_benchmarks/tool_usage/tasks/extraction_query.py +++ b/langchain_benchmarks/tool_usage/tasks/extraction_query.py @@ -1,64 +1,65 @@ -''' -Notes: - -Model needs to be aware that "today" means 2024-01-01 -Need to provide it with a lot of context about what langchain/smith/graph are used for -''' - -from typing import List, Literal, Optional, Union, cast +from datetime import datetime +from typing import List, Literal, Union, cast -from langchain.tools import BaseTool, tool from langchain.pydantic_v1 import BaseModel, Field -from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask -from datetime import datetime +from langchain.tools import BaseTool, tool from langchain_core.messages import HumanMessage -from langsmith.schemas import Example, Run -from langsmith.evaluation.evaluator import ( - EvaluationResult, - EvaluationResults, - RunEvaluator, -) - - - +from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask class DocQuery(BaseModel): """Query against documentation""" - query: str = Field(...,description="The question to answer") - source: Literal["langchain", "langsmith", "langgraph"] = Field(...,description="The documentation source to search against. Should be one of 'langchain', 'langsmith', or " - "'langgraph' depending on which one product the user question pertains to") + query: str = Field(..., description="The question to answer") + source: Literal["langchain", "langsmith", "langgraph"] = Field( + ..., + description="The documentation source to search against. 
Should be one of 'langchain', 'langsmith', or "
+        "'langgraph' depending on which product the user question pertains to",
+    )
+

 class TweetQuery(BaseModel):
     """Query against tweets"""

-    subject: str = Field(...,description="Subject to search for")
-    min_likes: Union[int, None] = Field(None,description="Minimum amount of likes on the tweet")
-    max_likes: Union[int, None] = Field(None,description="Maximum amount of likes on the tweet")
-    start_date: Union[datetime, None] = Field(None, description="Earliest date to start pulling tweets from")
-    end_date: Union[datetime, None] = Field(None,description="Latest date to pull tweets from, None if pulling up to the present")
-    has_link: bool = Field(False,description="Whether to query for tweets that have a link.")
+    subject: str = Field(..., description="Subject to search for")
+    min_likes: Union[int, None] = Field(
+        None, description="Minimum number of likes on the tweet"
+    )
+    max_likes: Union[int, None] = Field(
+        None, description="Maximum number of likes on the tweet"
+    )
+    start_date: Union[datetime, None] = Field(
+        None, description="Earliest date to start pulling tweets from"
+    )
+    end_date: Union[datetime, None] = Field(
+        None,
+        description="Latest date to pull tweets from, None if pulling up to the present",
+    )
+    has_link: bool = Field(
+        False, description="Whether to query for tweets that have a link."
+    )
+

 class BlogQuery(BaseModel):
     """Query against blog posts"""

-    subject: Union[str, None] = Field(...,description="Subject to search for")
-    authors: Union[None, str, list[str]] = Field(None,description="Authors to search for. None if not searching for a speific author, list if searching for more than one.")
-    start_date: Union[datetime, None] = Field(None, description="Earliest date to start pulling blog posts from")
-    end_date: Union[datetime, None] = Field(None,description="Latest date to pull blog posts from")
+    subject: Union[str, None] = Field(..., description="Subject to search for")
+    authors: Union[None, str, list[str]] = Field(
+        None,
+        description="Authors to search for. 
None if not searching for a speific author, list if searching for more than one.", + ) + start_date: Union[datetime, None] = Field( + None, description="Earliest date to start pulling blog posts from" + ) + end_date: Union[datetime, None] = Field( + None, description="Latest date to pull blog posts from" + ) + def get_environment() -> ToolUsageEnvironment: """Create an environment.""" tools = cast( List[BaseTool], - [ - tool(func) - for func in [ - TweetQuery, - DocQuery, - BlogQuery - ] - ], + [tool(func) for func in [TweetQuery, DocQuery, BlogQuery]], ) return ToolUsageEnvironment( tools=tools, @@ -68,132 +69,360 @@ def get_environment() -> ToolUsageEnvironment: DOC_DATASET = [ { - "question":[HumanMessage("How do I use the langgraph Send method?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'Send method','source':'langgraph'}}] + "question": [HumanMessage("How do I use the langgraph Send method?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "Send method", "source": "langgraph"}, + } + ], }, { - "question":[HumanMessage("How do you chain a prompt with a model?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'chaining prompt and model','source':'langchain'}}] + "question": [HumanMessage("How do you chain a prompt with a model?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "chaining prompt and model", "source": "langchain"}, + } + ], }, { - "question":[HumanMessage("How do you run a pairwise experiment in langsmith?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'pairwise experiment','source':'langsmith'}}] + "question": [ + HumanMessage("How do you run a pairwise experiment in langsmith?") + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "pairwise experiment", "source": "langsmith"}, + } + ], }, { - "question":[HumanMessage("What is a tool node?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'tool node','source':'langgraph'}}] + "question": [HumanMessage("What is a tool node?")], + "tool_calls": [ + {"name": "DocQuery", "args": {"query": "tool node", "source": "langgraph"}} + ], }, { - "question":[HumanMessage("How do I get the log probabilities of my chat model?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'log probabilities','source':'langchain'}}] + "question": [ + HumanMessage("How do I get the log probabilities of my chat model?") + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "log probabilities", "source": "langchain"}, + } + ], }, { - "question":[HumanMessage("How can I build my own custom evaluator?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'custom evaluator','source':'langsmith'}}] + "question": [HumanMessage("How can I build my own custom evaluator?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "custom evaluator", "source": "langsmith"}, + } + ], }, { - "question":[HumanMessage("How do I use a tool in a routing function?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'tool in routing function','source':'langgraph'}}] + "question": [HumanMessage("How do I use a tool in a routing function?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "tool in routing function", "source": "langgraph"}, + } + ], }, { - "question":[HumanMessage("How do use Pinecone as a vectorstore for few shot prompting?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'Pinecone for few shot prompting','source':'langchain'}}] + "question": [ + HumanMessage("How do use 
Pinecone as a vectorstore for few shot prompting?") + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": { + "query": "Pinecone for few shot prompting", + "source": "langchain", + }, + } + ], }, { - "question":[HumanMessage("How do I prevent personal data from being logged in my traces?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'personal data logging','source':'langsmith'}}] + "question": [ + HumanMessage( + "How do I prevent personal data from being logged in my traces?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "personal data logging", "source": "langsmith"}, + } + ], }, { - "question":[HumanMessage("How do you use a nested graph? Can you stream messages from inside them?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'nested graph','source':'langgraph'}}, - {'name': 'DocQuery', - 'args': {'query': 'stream messages nested graph','source':'langgraph'}}] + "question": [ + HumanMessage( + "How do you use a nested graph? Can you stream messages from inside them?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "nested graph", "source": "langgraph"}, + }, + { + "name": "DocQuery", + "args": { + "query": "stream messages nested graph", + "source": "langgraph", + }, + }, + ], }, { - "question":[HumanMessage("How do I extract text from PDF data for my retrieval chain? Can I combine image and text in a prompt?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'PDF extraction for chain','source':'langchain'}}, - {'name': 'DocQuery', - 'args': {'query': 'multimodal prompt','source':'langchain'}}] + "question": [ + HumanMessage( + "How do I extract text from PDF data for my retrieval chain? Can I combine image and text in a prompt?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "PDF extraction for chain", "source": "langchain"}, + }, + { + "name": "DocQuery", + "args": {"query": "multimodal prompt", "source": "langchain"}, + }, + ], }, { - "question":[HumanMessage("How do I setup automation rules for my traces? How do I view logs for those rules?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'automation rules for traces','source':'langsmith'}}, - {'name': 'DocQuery', - 'args': {'query': 'automation rules logs','source':'langsmith'}}] + "question": [ + HumanMessage( + "How do I setup automation rules for my traces? How do I view logs for those rules?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "automation rules for traces", "source": "langsmith"}, + }, + { + "name": "DocQuery", + "args": {"query": "automation rules logs", "source": "langsmith"}, + }, + ], }, ] TWEET_DATASET = [ { - "question":[HumanMessage("Did we have any tweets about agents with more than 1000 likes that also included a link?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'agents','min_likes':1000,'max_likes':None,"start_date":None,"end_date":None,"has_link":True}}] + "question": [ + HumanMessage( + "Did we have any tweets about agents with more than 1000 likes that also included a link?" 
+ ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "agents", + "min_likes": 1000, + "max_likes": None, + "start_date": None, + "end_date": None, + "has_link": True, + }, + } + ], }, { - "question":[HumanMessage("Are there any tweets about evaluators by langchain with less than 100 likes?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'evaluators','min_likes':None,'max_likes':100,"start_date":None,"end_date":None,"has_link":False}}] + "question": [ + HumanMessage( + "Are there any tweets about evaluators by langchain with less than 100 likes?" + ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "evaluators", + "min_likes": None, + "max_likes": 100, + "start_date": None, + "end_date": None, + "has_link": False, + }, + } + ], }, { - "question":[HumanMessage("Are there any tweets that link to the anthropic website in the last year?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'anthropic','min_likes':None,'max_likes':None,"start_date":datetime(2023,1,1),"end_date":None,"has_link":True}}] + "question": [ + HumanMessage( + "Are there any tweets that link to the anthropic website in the last year?" + ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "anthropic", + "min_likes": None, + "max_likes": None, + "start_date": datetime(2023, 1, 1), + "end_date": None, + "has_link": True, + }, + } + ], }, { - "question":[HumanMessage("In Q2 2023 did we tweet anything about LangSmith?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'LangSmith','min_likes':None,'max_likes':None,"start_date":datetime(2023,3,1),"end_date":datetime(2023,6,1),"has_link":False}}] + "question": [HumanMessage("In Q2 2023 did we tweet anything about LangSmith?")], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "LangSmith", + "min_likes": None, + "max_likes": None, + "start_date": datetime(2023, 3, 1), + "end_date": datetime(2023, 6, 1), + "has_link": False, + }, + } + ], }, { - "question":[HumanMessage("Were there any social media posts with triple digit likes about few shot prompting?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'few shot prompting','min_likes':100,'max_likes':999,"start_date":None,"end_date":None,"has_link":False}}] + "question": [ + HumanMessage( + "Were there any social media posts with triple digit likes about few shot prompting?" + ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "few shot prompting", + "min_likes": 100, + "max_likes": 999, + "start_date": None, + "end_date": None, + "has_link": False, + }, + } + ], }, { - "question":[HumanMessage("Are there any posts aout LangServe before June 2023 that have more than 2000 likes and include a link?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'LangServe','min_likes':2000,'max_likes':None,"start_date":None,"end_date":datetime(2023,5,31),"has_link":True}}] + "question": [ + HumanMessage( + "Are there any posts aout LangServe before June 2023 that have more than 2000 likes and include a link?" 
+ ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "LangServe", + "min_likes": 2000, + "max_likes": None, + "start_date": None, + "end_date": datetime(2023, 5, 31), + "has_link": True, + }, + } + ], }, ] BLOG_DATASET = [ { - "question":[HumanMessage("what are some blog posts in the past year about agents?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'agents','authors':None,"start_date":datetime(2023,1,1),"end_date":None}}] + "question": [ + HumanMessage("what are some blog posts in the past year about agents?") + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "agents", + "authors": None, + "start_date": datetime(2023, 1, 1), + "end_date": None, + }, + } + ], }, { - "question":[HumanMessage("how many blogs mentioned chat-gpt in the month after October 2023?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'chat gpt','authors':None,"start_date":datetime(2023,11,1),"end_date":datetime(2023,11,30)}}] + "question": [ + HumanMessage( + "how many blogs mentioned chat-gpt in the month after October 2023?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "chat gpt", + "authors": None, + "start_date": datetime(2023, 11, 1), + "end_date": datetime(2023, 11, 30), + }, + } + ], }, { - "question":[HumanMessage("what has Bagatur written about universal configurable models?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'universal configurable model','authors':"Bagatur","start_date":None,"end_date":None}}] + "question": [ + HumanMessage( + "what has Bagatur written about universal configurable models?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "universal configurable model", + "authors": "Bagatur", + "start_date": None, + "end_date": None, + }, + } + ], }, { - "question":[HumanMessage("Have Harrison or Bagatur written anything about passing in runnables as tools in the last week?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'runnables as tools','authors':["Harrison","Bagatur"],"start_date":datetime(2023,12,24),"end_date":None}}] + "question": [ + HumanMessage( + "Have Harrison or Bagatur written anything about passing in runnables as tools in the last week?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "runnables as tools", + "authors": ["Harrison", "Bagatur"], + "start_date": datetime(2023, 12, 24), + "end_date": None, + }, + } + ], }, { - "question":[HumanMessage("Did Harrison write anything about LangGraph in the time frame up to the end of Q1 2023?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'LangGraph','authors':"Harrison","start_date":None,"end_date":datetime(2023,3,1)}}] - } + "question": [ + HumanMessage( + "Did Harrison write anything about LangGraph in the time frame up to the end of Q1 2023?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "LangGraph", + "authors": "Harrison", + "start_date": None, + "end_date": datetime(2023, 3, 1), + }, + } + ], + }, ] DATASET = DOC_DATASET + TWEET_DATASET + BLOG_DATASET @@ -202,8 +431,14 @@ def get_environment() -> ToolUsageEnvironment: name="Extraction Task", dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d", create_environment=get_environment, - instructions=( - "You are requested to solve math questions in an alternate" + instructions=(""" + You are requested to generate queries for searching either through tweets, docs, or blog entries. 
+    Inside the docs there are three different sources that you may wish to query: LangGraph, LangSmith, or LangChain.
+    LangGraph is a library for building stateful, multi-actor applications with LLMs, used to create agent and multi-agent workflows.
+    LangSmith is an all-in-one developer platform for every step of the LLM-powered application lifecycle.
+    It helps you debug, evaluate, test, and monitor your LLM applications.
+    LangChain is a framework for building with LLMs by chaining interoperable components.
+    Finally, some questions will ask about date ranges; remember that today's date is 2024-01-01.
+    """
     ),
     description=(
         """\
@@ -221,6 +456,7 @@ def get_environment() -> ToolUsageEnvironment:
     },
 )
 
+
 def _create_dataset() -> None:
     """Create a dataset with the langsmith client."""
     from langsmith.client import Client
@@ -238,8 +474,4 @@ def _create_dataset() -> None:
             "reference": example["tool_calls"],
         },
         dataset_id=dataset_id,
-    )
-
-if __name__=="__main__":
-    #_create_dataset()
-    pass
\ No newline at end of file
+    )
\ No newline at end of file
diff --git a/scripts/extract_task.py b/scripts/extract_task.py
index 029da94..0f83f6a 100644
--- a/scripts/extract_task.py
+++ b/scripts/extract_task.py
@@ -1,30 +1,57 @@
-from langsmith.evaluation import evaluate
-from langchain_benchmarks.tool_usage.tasks.extraction_query import *
-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
 import json
+from langchain_benchmarks.tool_usage.tasks.extraction_query import *
+from datetime import datetime
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI
+from langsmith.evaluation import evaluate
+from langchain_benchmarks.model_registration import model_registry
+from langsmith.evaluation.evaluator import (
+    EvaluationResult,
+    EvaluationResults,
+)
+from langsmith.schemas import Example, Run
+from typing import Optional
+
+
 llm = ChatOpenAI(model="gpt-4o")
+# TODO: system prompt
 llm_judge = ChatOpenAI(model="gpt-4o")
-judge_prompt = ChatPromptTemplate.from_messages([
-    ("system","You are an llm tasked with determining if the subject extracted by another LLM is an accurate "
-    "representation of the correct answer. You are to check for general semantic similarity since the words might not "
-    "match up perfectly but the meaning might still be the same. Return YES if the answers match, and NO otherwise. "
-    "Never return anything other than YES or NO."),
-    ("human","Is this query: {run_query} very similar to this reference query: {reference_query}")
-])
+judge_prompt = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            "You are an LLM tasked with determining if the subject extracted by another LLM is an accurate "
+            "representation of the correct answer. You are to check for general semantic similarity since the words might not "
+            "match up perfectly but the meaning might still be the same. Return YES if the answers match, and NO otherwise. 
" + "Never return anything other than YES or NO.", + ), + ( + "human", + "Is this query: {run_query} very similar to this reference query: {reference_query}", + ), + ] +) judge_chain = judge_prompt | llm_judge | StrOutputParser() -tools = [DocQuery,TweetQuery,BlogQuery] +tools = [DocQuery, TweetQuery, BlogQuery] llm = llm.bind_tools(tools) def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults: # Chose the correct tool - correct_tool_score = int([tool['name'] for tool in example_outputs['reference']] == [tool['function']["name"] for tool in run_outputs['response'].additional_kwargs['tool_calls']]) + correct_tool_score = int( + [tool["name"] for tool in example_outputs["reference"]] + == [ + tool["function"]["name"] + for tool in run_outputs["response"].additional_kwargs["tool_calls"] + ] + ) # Has the correct determenistic args determinstic_score = 0 @@ -33,18 +60,44 @@ def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResul if correct_tool_score == 1: determinstic_score, underministic_score = 1, 1 - for tool in example_outputs['reference']: - corresponding_response_tool = json.loads([t['function'] for t in run_outputs['response'].additional_kwargs['tool_calls'] if t['function']["name"]==tool["name"]][0]['arguments']) - for arg in tool['args']: - if arg in ['query','subject']: - ans = judge_chain.invoke({"run_query":corresponding_response_tool[arg],"reference_query":tool['args'][arg]}) + for tool in example_outputs["reference"]: + corresponding_response_tool = json.loads( + [ + t["function"] + for t in run_outputs["response"].additional_kwargs["tool_calls"] + if t["function"]["name"] == tool["name"] + ][0]["arguments"] + ) + for arg in tool["args"]: + if arg in ["query", "subject"]: + ans = judge_chain.invoke( + { + "run_query": corresponding_response_tool[arg], + "reference_query": tool["args"][arg], + } + ) underministic_score = 1 if ans == "YES" else 0 else: - if (tool['args'][arg] and arg not in corresponding_response_tool) or (tool['args'][arg] and not (tool['args'][arg] == corresponding_response_tool[arg]) and \ - not (isinstance(tool['args'][arg],datetime) and datetime.fromisoformat((corresponding_response_tool[arg])).replace(tzinfo=None) == tool['args'][arg])): + if ( + tool["args"][arg] and arg not in corresponding_response_tool + ) or ( + tool["args"][arg] + and not (tool["args"][arg] == corresponding_response_tool[arg]) + and not ( + isinstance(tool["args"][arg], datetime) + and datetime.fromisoformat( + (corresponding_response_tool[arg]) + ).replace(tzinfo=None) + == tool["args"][arg] + ) + ): determinstic_score = 0 # Overall correctness - overall_score = int(bool(correct_tool_score) and bool(determinstic_score) and bool(underministic_score)) + overall_score = int( + bool(correct_tool_score) + and bool(determinstic_score) + and bool(underministic_score) + ) results = [ EvaluationResult( key="Correct tool", @@ -61,24 +114,51 @@ def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResul EvaluationResult( key="Overall correctness", score=overall_score, - ) + ), ] - return {"results":results} + return {"results": results} + -def evaluate_run( - run: Run, example: Optional[Example] = None - ) -> EvaluationResults: - return compare_outputs( - run.outputs, - example.outputs) +def evaluate_run(run: Run, example: Optional[Example] = None) -> EvaluationResults: + return compare_outputs(run.outputs, example.outputs) -def predict(example: dict): - return {"response":llm.invoke(example['question'])} +instructions = 
"" +ChatPromptTemplate.from_messages( + [ + ("system", "{instructions}"), + ("human", "{input}"), + ] +) + +prompt = ChatPromptTemplate.from_messages( + [ + ("system", "{instructions}"), + ("human", "{input}"), + ] +) +def predict_for_model(model,instructions): + chain = prompt | model.bind_tools(tools) + def predict(example: dict): + return {"response": chain.invoke({"input":example["question"],"instructions":instructions})} + return predict +''' experiment_results = evaluate( - predict, + predict, + data=EXTRACTION_TASK.name, + evaluators=[evaluate_run], + experiment_prefix="test-single-tool", +)''' + +models = ["gpt-4o", "firefunction-v2"] + +for model_name in models: + model = model_registry[model_name].get_model() + evaluate( + predict_for_model(model,EXTRACTION_TASK.instructions), data=EXTRACTION_TASK.name, evaluators=[evaluate_run], - experiment_prefix="test-single-tool", - ) \ No newline at end of file + experiment_prefix=f"test-{model_name}", + ) +