Add registry object, add eval notebook (#25)
Add registry object
Add eval notebook
eyurtsev authored Nov 17, 2023
1 parent d958280 commit 65aeb98
Showing 10 changed files with 487 additions and 6 deletions.
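Taken together, the new `registry` object and the evaluation notebook enable a short end-to-end workflow. The sketch below is assembled from the notebook cells in this commit; it assumes the package is installed and LangSmith credentials are configured in the environment:

```python
# Sketch assembled from the tool_usage.ipynb cells added in this commit;
# assumes langchain-benchmarks is installed and LangSmith env vars are set.
from langchain_benchmarks import clone_public_dataset
from langchain_benchmarks.tool_usage import registry

alpha = registry[0]            # the "Tool Usage - Alpha" environment
tools = alpha.tools_factory()  # the tools the agent is allowed to call

# Copy the public dataset into your own LangSmith organization.
clone_public_dataset(alpha.dataset_id, dataset_name=alpha.name)
```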
12 changes: 9 additions & 3 deletions docs/source/notebooks/datasets.ipynb
@@ -68,7 +68,9 @@
}
],
"source": [
"download_public_dataset('https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples')"
"download_public_dataset(\n",
" \"https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples\"\n",
")"
]
},
{
@@ -139,7 +141,8 @@
],
"source": [
"import json\n",
"with open('./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json', 'r', encoding='utf-8') as f:\n",
"\n",
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
]
},
@@ -169,7 +172,10 @@
},
"outputs": [],
"source": [
"clone_public_dataset('https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples', dataset_name='Agent Dataset')"
"clone_public_dataset(\n",
" \"https://api.smith.langchain.com/public/e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5/examples\",\n",
" dataset_name=\"Agent Dataset\",\n",
")"
]
}
],
341 changes: 341 additions & 0 deletions docs/source/notebooks/tool_usage.ipynb
@@ -0,0 +1,341 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
"metadata": {},
"source": [
"# Tool Usage\n",
"\n",
"Let's see how to evaluate an agent's ability to use tools."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage import registry\n",
"from langchain_benchmarks import clone_public_dataset"
]
},
{
"cell_type": "markdown",
"id": "03488ab1-31ed-41c2-8da2-46b02599b181",
"metadata": {},
"source": [
"For this code to work, please configure LangSmith environment variables with your credentials."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3644d211-382e-41aa-b282-21b01d28fc35",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<thead>\n",
"<tr><th style=\"text-align: right;\"> ID</th><th>Name </th><th>Dataset ID </th><th>Description </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td style=\"text-align: right;\"> 0</td><td>Tool Usage - Alpha</td><td>e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5</td><td>Environment with fake data about users and their locations and favorite foods.\n",
"\n",
"The environment provides a set of tools that can be used to query the data.\n",
"\n",
"The object is to evaluate the ability of an agent to use the tools\n",
"to answer questions about the data.\n",
"\n",
"The dataset contains 21 examples of varying difficulty. The difficulty is measured\n",
"by the number of tools that need to be used to answer the question.\n",
"\n",
"Each example is composed of a question, a reference answer, and\n",
"information about the sequence in which tools should be used to answer\n",
"the question.\n",
"\n",
"Success is measured by the ability to answer the question correctly, and efficiently. </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"Registry(environments=[Environment(id=0, name='Tool Usage - Alpha', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', tools_factory=<function get_tools at 0x7f944560b0d0>, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe object is to evaluate the ability of an agent to use the tools\\nto answer questions about the data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured\\nby the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and\\ninformation about the sequence in which tools should be used to answer\\nthe question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n')])"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"registry"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "671282f8-c455-4390-b018-e53bbd833093",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"alpha = registry[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cf6dca5d-63cf-4315-8206-726abe816473",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<tbody>\n",
"<tr><td>ID </td><td>0 </td></tr>\n",
"<tr><td>Name </td><td>Tool Usage - Alpha </td></tr>\n",
"<tr><td>Dataset ID </td><td>e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5</td></tr>\n",
"<tr><td>Description</td><td>Environment with fake data about users and their locations and favorite foods.\n",
"\n",
"The environment prov... </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"Environment(id=0, name='Tool Usage - Alpha', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', tools_factory=<function get_tools at 0x7f944560b0d0>, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective is to evaluate the ability of an agent to use the tools\\nto answer questions about the data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured\\nby the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and\\ninformation about the sequence in which tools should be used to answer\\nthe question.\\n\\nSuccess is measured by the ability to answer the question correctly and efficiently.\\n')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"alpha"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "70369f67-deb4-467a-801a-6d38c3d0460d",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset Tool Usage - Alpha already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/9b745a89-c06a-4602-a258-f94e9e292dde.\n"
]
}
],
"source": [
"clone_public_dataset(alpha.dataset_id, dataset_name=alpha.name)"
]
},
{
"cell_type": "markdown",
"id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
"metadata": {},
"source": [
"## Define an agent\n",
"\n",
"Let's build an agent that we can use for evaluation."
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b30a36a8-a56b-4767-b233-2841747b769a",
"metadata": {},
"outputs": [],
"source": [
"from langchain.schema.runnable import RunnablePassthrough\n",
"from dateutil.parser import parse\n",
"from langchain.agents import AgentExecutor\n",
"from langchain.agents.format_scratchpad import format_to_openai_functions\n",
"from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
"from langchain.tools.render import format_tool_to_openai_function\n",
"\n",
"\n",
"TOOLS = alpha.tools_factory()\n",
"\n",
"\n",
"def agent_factory() -> AgentExecutor:\n",
" \"\"\"Agent Executor\"\"\"\n",
" llm = ChatOpenAI(\n",
" model=\"gpt-3.5-turbo-16k\",\n",
" temperature=0,\n",
" )\n",
"\n",
" llm_with_tools = llm.bind(\n",
" functions=[format_tool_to_openai_function(t) for t in TOOLS]\n",
" )\n",
" prompt = ChatPromptTemplate.from_messages(\n",
" [\n",
" (\n",
" \"system\",\n",
" \"You are a helpful assistant. Use the given tools to answer the question. Keep in mind that an ID is distinct from a name for every entity.\",\n",
" ),\n",
" MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
" (\"user\", \"{input}\"),\n",
" ]\n",
" )\n",
"\n",
" runnable_agent = (\n",
" {\n",
" \"input\": lambda x: x[\"question\"],\n",
" \"agent_scratchpad\": lambda x: format_to_openai_functions(\n",
" x[\"intermediate_steps\"]\n",
" ),\n",
" }\n",
" | prompt\n",
" | llm_with_tools\n",
" | OpenAIFunctionsAgentOutputParser()\n",
" )\n",
"\n",
" def _ensure_output_exists(inputs):\n",
" \"\"\"Make sure that the output key is always present.\"\"\"\n",
" if \"output\" not in inputs:\n",
" return {\"output\": \"\", **inputs}\n",
" return inputs\n",
"\n",
" return (\n",
" AgentExecutor(\n",
" agent=runnable_agent,\n",
" tools=TOOLS,\n",
" handle_parsing_errors=True,\n",
" return_intermediate_steps=True,\n",
" )\n",
" | _ensure_output_exists\n",
" )"
]
},
{
"cell_type": "markdown",
"id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
"metadata": {},
"source": [
"Let's test that our agent works"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0e4896fa-3633-44a1-857f-80a263cf2e03",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'question': 'who is bob?',\n",
" 'output': 'Bob is a user with the ID 21.',\n",
" 'intermediate_steps': [(AgentActionMessageLog(tool='find_users_by_name', tool_input={'name': 'bob'}, log=\"\\nInvoking: `find_users_by_name` with `{'name': 'bob'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n\"name\": \"bob\"\\n}', 'name': 'find_users_by_name'}})]),\n",
" [{'id': 21, 'name': 'Bob'},\n",
" {'id': 41, 'name': 'Donna'},\n",
" {'id': 1, 'name': 'Alice'},\n",
" {'id': 35, 'name': 'Charlie'},\n",
" {'id': 42, 'name': 'Eve'},\n",
" {'id': 43, 'name': 'Frank The Cat'}]),\n",
" (AgentActionMessageLog(tool='get_user_name', tool_input={'user_id': 21}, log=\"\\nInvoking: `get_user_name` with `{'user_id': 21}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n\"user_id\": 21\\n}', 'name': 'get_user_name'}})]),\n",
" 'Bob')]}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent_factory().invoke({\"question\": \"who is bob?\"})"
]
},
{
"cell_type": "markdown",
"id": "3821e4b0-8e67-418a-840c-470fcde42df0",
"metadata": {},
"source": [
"## Eval\n",
"\n",
"Let's evaluate an agent now"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "513042fe-2878-44f8-ae84-05b9d521c1de",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
"from langsmith.client import Client"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "2bedd9d1-fc06-4066-9f89-b874ae818d82",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"client = Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1",
"metadata": {},
"outputs": [],
"source": [
"test_run = client.run_on_dataset(\n",
" dataset_name=alpha.name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=STANDARD_AGENT_EVALUATOR,\n",
" verbose=True,\n",
" tags=[\"openai-functions\"],\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
5 changes: 5 additions & 0 deletions langchain_benchmarks/tool_usage/__init__.py
@@ -1 +1,6 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
from langchain_benchmarks.tool_usage.registration import registry

# Please keep this list sorted!
__all__ = ["registry", "STANDARD_AGENT_EVALUATOR"]
@@ -398,4 +398,4 @@ def get_tools() -> List[BaseTool]:


# ID of a dataset that contains the questions and references
DATASET_ID = "9f73165c-d333-4d14-8f59-bd7eede5db08" # ID of Agent Gym: E01 Alpha
DATASET_ID = "e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5" # ID of Agent Gym: E01 Alpha
1 change: 1 addition & 0 deletions langchain_benchmarks/tool_usage/evaluators.py
@@ -70,4 +70,5 @@ def evaluate_run(
custom_evaluators=[AgentTrajectoryEvaluator()],
# We now need to specify this because we have multiple outputs in our dataset
reference_key="reference",
prediction_key="output",
)
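The added `prediction_key` tells the evaluator which of the agent's output keys holds the prediction, which matters now that the agent returns both `output` and `intermediate_steps`. For context, the visible kwargs plausibly close a config object along these lines; `RunEvalConfig` is an inference from the call shape, not confirmed by the hunk:

```python
# Speculative reconstruction of the enclosing definition in evaluators.py.
# RunEvalConfig is an assumed import; AgentTrajectoryEvaluator is defined
# earlier in the same file (see the custom_evaluators line above).
from langchain.smith import RunEvalConfig

STANDARD_AGENT_EVALUATOR = RunEvalConfig(
    custom_evaluators=[AgentTrajectoryEvaluator()],
    # We now need to specify this because we have multiple outputs in our dataset
    reference_key="reference",
    prediction_key="output",
)
```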