Add registry object, add eval notebook (#25)
Add registry object
Add eval notebook
eyurtsev authored Nov 17, 2023
1 parent d958280 commit 65aeb98
Showing 10 changed files with 487 additions and 6 deletions.
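Taken together, the new `registry` object and the evaluation notebook enable a short end-to-end workflow. The sketch below is assembled from the notebook cells in this commit; it assumes the package is installed and LangSmith credentials are configured in the environment:

```python
# Sketch assembled from the tool_usage.ipynb cells added in this commit;
# assumes langchain-benchmarks is installed and LangSmith env vars are set.
from langchain_benchmarks import clone_public_dataset
from langchain_benchmarks.tool_usage import registry

alpha = registry[0]            # the "Tool Usage - Alpha" environment
tools = alpha.tools_factory()  # the tools the agent is allowed to call

# Copy the public dataset into your own LangSmith organization.
clone_public_dataset(alpha.dataset_id, dataset_name=alpha.name)
```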
12 changes: 9 additions & 3 deletions docs/source/notebooks/datasets.ipynb
@@ -68,7 +68,9 @@
}
],
"source": [
"download_public_dataset('https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples')"
"download_public_dataset(\n",
" \"https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples\"\n",
")"
]
},
{
@@ -139,7 +141,8 @@
],
"source": [
"import json\n",
"with open('./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json', 'r', encoding='utf-8') as f:\n",
"\n",
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
]
},
@@ -169,7 +172,10 @@
},
"outputs": [],
"source": [
"clone_public_dataset('https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples', dataset_name='Agent Dataset')"
"clone_public_dataset(\n",
" \"https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples\",\n",
" dataset_name=\"Agent Dataset\",\n",
")"
]
}
],
341 changes: 341 additions & 0 deletions docs/source/notebooks/tool_usage.ipynb
@@ -0,0 +1,341 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
"metadata": {},
"source": [
"# Tool Usage\n",
"\n",
"Let's see how to evaluate an agent's ability to use tools."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage import registry\n",
"from langchain_benchmarks import clone_public_dataset"
]
},
{
"cell_type": "markdown",
"id": "03488ab1-31ed-41c2-8da2-46b02599b181",
"metadata": {},
"source": [
"For this code to work, please configure LangSmith environment variables with your credentials."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3644d211-382e-41aa-b282-21b01d28fc35",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<thead>\n",
"<tr><th style=\"text-align: right;\"> ID</th><th>Name </th><th>Dataset ID </th><th>Description </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td style=\"text-align: right;\"> 0</td><td>Tool Usage - Alpha</td><td>e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5</td><td>Environment with fake data about users and their locations and favorite foods.\n",
"\n",
"The environment provides a set of tools that can be used to query the data.\n",
"\n",
"The object is to evaluate the ability of an agent to use the tools\n",
"to answer questions about the data.\n",
"\n",
"The dataset contains 21 examples of varying difficulty. The difficulty is measured\n",
"by the number of tools that need to be used to answer the question.\n",
"\n",
"Each example is composed of a question, a reference answer, and\n",
"information about the sequence in which tools should be used to answer\n",
"the question.\n",
"\n",
"Success is measured by the ability to answer the question correctly, and efficiently. </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"Registry(environments=[Environment(id=0, name='Tool Usage - Alpha', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', tools_factory=<function get_tools at 0x7f944560b0d0>, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe object is to evaluate the ability of an agent to use the tools\\nto answer questions about the data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured\\nby the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and\\ninformation about the sequence in which tools should be used to answer\\nthe question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n')])"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"registry"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "671282f8-c455-4390-b018-e53bbd833093",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"alpha = registry[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cf6dca5d-63cf-4315-8206-726abe816473",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<tbody>\n",
"<tr><td>ID </td><td>0 </td></tr>\n",
"<tr><td>Name </td><td>Tool Usage - Alpha </td></tr>\n",
"<tr><td>Dataset ID </td><td>e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5</td></tr>\n",
"<tr><td>Description</td><td>Environment with fake data about users and their locations and favorite foods.\n",
"\n",
"The environment prov... </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"Environment(id=0, name='Tool Usage - Alpha', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', tools_factory=<function get_tools at 0x7f944560b0d0>, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective is to evaluate the ability of an agent to use the tools\\nto answer questions about the data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured\\nby the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and\\ninformation about the sequence in which tools should be used to answer\\nthe question.\\n\\nSuccess is measured by the ability to answer the question correctly and efficiently.\\n')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"alpha"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "70369f67-deb4-467a-801a-6d38c3d0460d",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset Tool Usage - Alpha already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/9b745a89-c06a-4602-a258-f94e9e292dde.\n"
]
}
],
"source": [
"clone_public_dataset(alpha.dataset_id, dataset_name=alpha.name)"
]
},
{
"cell_type": "markdown",
"id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
"metadata": {},
"source": [
"## Define an agent\n",
"\n",
"Let's build an agent that we can use for evaluation."
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b30a36a8-a56b-4767-b233-2841747b769a",
"metadata": {},
"outputs": [],
"source": [
"from langchain.schema.runnable import RunnablePassthrough\n",
"from dateutil.parser import parse\n",
"from langchain.agents import AgentExecutor\n",
"from langchain.agents.format_scratchpad import format_to_openai_functions\n",
"from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
"from langchain.tools.render import format_tool_to_openai_function\n",
"\n",
"\n",
"TOOLS = alpha.tools_factory()\n",
"\n",
"\n",
"def agent_factory() -> AgentExecutor:\n",
" \"\"\"Agent Executor\"\"\"\n",
" llm = ChatOpenAI(\n",
" model=\"gpt-3.5-turbo-16k\",\n",
" temperature=0,\n",
" )\n",
"\n",
" llm_with_tools = llm.bind(\n",
" functions=[format_tool_to_openai_function(t) for t in TOOLS]\n",
" )\n",
" prompt = ChatPromptTemplate.from_messages(\n",
" [\n",
" (\n",
" \"system\",\n",
" \"You are a helpful assistant. Use the given tools to answer the question. Keep in mind that an ID is distinct from a name for every entity.\",\n",
" ),\n",
" MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
" (\"user\", \"{input}\"),\n",
" ]\n",
" )\n",
"\n",
" runnable_agent = (\n",
" {\n",
" \"input\": lambda x: x[\"question\"],\n",
" \"agent_scratchpad\": lambda x: format_to_openai_functions(\n",
" x[\"intermediate_steps\"]\n",
" ),\n",
" }\n",
" | prompt\n",
" | llm_with_tools\n",
" | OpenAIFunctionsAgentOutputParser()\n",
" )\n",
"\n",
" def _ensure_output_exists(inputs):\n",
" \"\"\"Make sure that the output key is always present.\"\"\"\n",
" if \"output\" not in inputs:\n",
" return {\"output\": \"\", **inputs}\n",
" return inputs\n",
"\n",
" return (\n",
" AgentExecutor(\n",
" agent=runnable_agent,\n",
" tools=TOOLS,\n",
" handle_parsing_errors=True,\n",
" return_intermediate_steps=True,\n",
" )\n",
" | _ensure_output_exists\n",
" )"
]
},
{
"cell_type": "markdown",
"id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
"metadata": {},
"source": [
"Let's test that our agent works"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0e4896fa-3633-44a1-857f-80a263cf2e03",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'question': 'who is bob?',\n",
" 'output': 'Bob is a user with the ID 21.',\n",
" 'intermediate_steps': [(AgentActionMessageLog(tool='find_users_by_name', tool_input={'name': 'bob'}, log=\"\\nInvoking: `find_users_by_name` with `{'name': 'bob'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n\"name\": \"bob\"\\n}', 'name': 'find_users_by_name'}})]),\n",
" [{'id': 21, 'name': 'Bob'},\n",
" {'id': 41, 'name': 'Donna'},\n",
" {'id': 1, 'name': 'Alice'},\n",
" {'id': 35, 'name': 'Charlie'},\n",
" {'id': 42, 'name': 'Eve'},\n",
" {'id': 43, 'name': 'Frank The Cat'}]),\n",
" (AgentActionMessageLog(tool='get_user_name', tool_input={'user_id': 21}, log=\"\\nInvoking: `get_user_name` with `{'user_id': 21}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n\"user_id\": 21\\n}', 'name': 'get_user_name'}})]),\n",
" 'Bob')]}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent_factory().invoke({\"question\": \"who is bob?\"})"
]
},
{
"cell_type": "markdown",
"id": "3821e4b0-8e67-418a-840c-470fcde42df0",
"metadata": {},
"source": [
"## Eval\n",
"\n",
"Let's evaluate an agent now"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "513042fe-2878-44f8-ae84-05b9d521c1de",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
"from langsmith.client import Client"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "2bedd9d1-fc06-4066-9f89-b874ae818d82",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"client = Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1",
"metadata": {},
"outputs": [],
"source": [
"test_run = client.run_on_dataset(\n",
" dataset_name=alpha.name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=STANDARD_AGENT_EVALUATOR,\n",
" verbose=True,\n",
" tags=[\"openai-functions\"],\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
5 changes: 5 additions & 0 deletions langchain_benchmarks/tool_usage/__init__.py
@@ -1 +1,6 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
from langchain_benchmarks.tool_usage.registration import registry

# Please keep this list sorted!
__all__ = ["registry", "STANDARD_AGENT_EVALUATOR"]
@@ -398,4 +398,4 @@ def get_tools() -> List[BaseTool]:


# ID of a dataset that contains the questions and references
DATASET_ID = "9f73165c-d333-4d14-8f59-bd7eede5db08" # ID of Agent Gym: E01 Alpha
DATASET_ID = "e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5" # ID of Agent Gym: E01 Alpha
1 change: 1 addition & 0 deletions langchain_benchmarks/tool_usage/evaluators.py
@@ -70,4 +70,5 @@ def evaluate_run(
custom_evaluators=[AgentTrajectoryEvaluator()],
# We now need to specify this because we have multiple outputs in our dataset
reference_key="reference",
prediction_key="output",
)
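The added `prediction_key` tells the evaluator which of the agent's output keys holds the prediction, which matters now that the agent returns both `output` and `intermediate_steps`. For context, the visible kwargs plausibly close a config object along these lines; `RunEvalConfig` is an inference from the call shape, not confirmed by the hunk:

```python
# Speculative reconstruction of the enclosing definition in evaluators.py.
# RunEvalConfig is an assumed import; AgentTrajectoryEvaluator is defined
# earlier in the same file (see the custom_evaluators line above).
from langchain.smith import RunEvalConfig

STANDARD_AGENT_EVALUATOR = RunEvalConfig(
    custom_evaluators=[AgentTrajectoryEvaluator()],
    # We now need to specify this because we have multiple outputs in our dataset
    reference_key="reference",
    prediction_key="output",
)
```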