diff --git a/docs/source/notebooks/tool_usage/intro.ipynb b/docs/source/notebooks/tool_usage/intro.ipynb index 06d89d9..b228b8b 100644 --- a/docs/source/notebooks/tool_usage/intro.ipynb +++ b/docs/source/notebooks/tool_usage/intro.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "3b9b82fc-b689-4a25-b718-99ecc2fc6867", "metadata": { "tags": [] @@ -136,19 +136,21 @@ "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", "\n", "Success is measured by the ability to answer the question correctly, and efficiently. \n", - "Multiverse Math ToolUsageTask594f9f60-30a0-49bf-b075-f44beabf546aAn environment that contains a few basic math operations, but with altered results.\n", + "Multiverse Math ToolUsageTask47ed57bc-e852-4f84-a23e-cce4793864e9An environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", - "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math. \n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\n", + "\n", + "This task is associated with 20 test examples. \n", "\n", "" ], "text/plain": [ - "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})])" + "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. 
\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})])" ] }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -169,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "7543739b-d212-4249-9b4a-fc406a58c9c7", "metadata": { "tags": [] @@ -198,10 +200,10 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -248,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "f201dbbe-7d92-4bc7-b4b5-ea8901dd2970", "metadata": { "tags": [] @@ -257,13 +259,13 @@ { "data": { "text/plain": [ - "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x1277c18a0>),\n", - " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x1277c13a0>),\n", - " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x1277c19e0>),\n", - " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x1277c1800>)]" + "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x7b3a9f62c9a0>),\n", + " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x7b3a9f62c5e0>),\n", + " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x7b3a9f62cae0>),\n", + " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x7b3a9f62cb80>)]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -275,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "b07957ee-ae52-47d4-a4ff-aa99d4d9bdaf", "metadata": { "tags": [] @@ -287,7 +289,7 @@ "'OK'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "40fbb9b6-00f6-4445-b480-00eed6b5b3aa", "metadata": { "tags": [] @@ -312,7 +314,7 @@ "'aac'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -326,134 +328,118 @@ "id": "8d39b9b3-d4da-49bc-b3db-8a4165b1db55", "metadata": {}, "source": [ - "## Creating an agent\n", + "## Create an Agent!\n", "\n", - "So now that you know how the test environment works, it's time to define an agent! \n", + "Now that you know how the test environment works, let's create an agent that we can test!\n", "\n", - "We will follow the example in the LangChain documentation to [define an OpenAI tool using agent](https://python.langchain.com/docs/modules/agents/). " + "Because an agent interacts with the environment via tools and can change the state of the environment during the course of an agent run, what we actually want is the ability to create a fresh agent and a fresh environment for each test run.\n", + "\n", + "We'll do this using a factory. A factory is just a fancy name in computer science for an object that can create other objects. In this case, we'll have an Agent Factory that we can call and it'll create a fresh agent for us on each call.\n", + "\n", + "We'll use the StandardAgentFactory which under the hood creates a standard LangChain [tool calling agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/). It can be used with any [Chat Model that support tool calling](https://python.langchain.com/docs/integrations/chat/)." 
] }, { "cell_type": "code", - "execution_count": 26, - "id": "8827186a-8ed3-43c7-956c-71342e0a7bf2", - "metadata": { - "tags": [] - }, + "execution_count": 7, + "id": "db65c253-7710-4c7b-b968-0662ec089030", + "metadata": {}, "outputs": [], "source": [ - "from langchain.agents.format_scratchpad.openai_tools import (\n", - "    format_to_openai_tool_messages,\n", - ")\n", - "from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser\n", - "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n", - "from langchain.tools.render import (\n", - "    format_tool_to_openai_function,\n", - "    format_tool_to_openai_tool,\n", - ")\n", - "from langchain_community.chat_models import ChatOpenAI\n", - "from langchain_core.runnables import RunnableParallel\n", - "\n", - "tools = task.create_environment().tools\n", - "formatted_tools = [format_tool_to_openai_tool(t) for t in tools]\n", - "llm = ChatOpenAI(model=\"gpt-3.5-turbo-1106\", temperature=0, model_kwargs={\"seed\": 42})\n", - "# Compose the llm call with the tools' JSON schemas\n", - "llm_with_tools = llm.bind(tools=formatted_tools)\n", - "format_inputs = RunnableParallel(\n", - "    {\n", - "        \"input\": lambda x: x[\"input\"],\n", - "        \"agent_scratchpad\": lambda x: format_to_openai_tool_messages(\n", - "            x[\"intermediate_steps\"]\n", - "        ),\n", - "    }\n", - ")\n", + "from langchain_anthropic.chat_models import ChatAnthropic\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", "\n", + "model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n", "prompt = ChatPromptTemplate.from_messages(\n", "    [\n", + "        (\"system\", \"{instructions}\"),  # Populated from task.instructions automatically\n", "        (\n", - "            \"system\",\n", - "            \"You are very powerful assistant, but bad at calculating lengths of words.\",\n", - "        ),\n", - "        (\"user\", \"{input}\"),\n", - "        MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n", + "            \"human\",\n", + "            \"{question}\",\n", + "        ),  # Each evaluation example is associated with a question\n", + "        (\"placeholder\", \"{agent_scratchpad}\"),  # Space for the agent to do work\n", "    ]\n", ")\n", - "agent_definition = (\n", - "    # Input to this pipeline is a dictionary with \"input\" and \"intermediate_steps\" keys\n", - "    format_inputs | prompt | llm_with_tools | OpenAIToolsAgentOutputParser()\n", - ")" + "\n", + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "markdown", - "id": "7614ab73-dc66-4f2e-9eeb-ff1711c113d0", + "id": "5c99a9bd-fa3e-4401-9062-77dbcff30d5c", "metadata": {}, "source": [ - "### Agent Factory\n", - "\n", - "As discussed above, each test environment tracks state. We want to create a new environment for each data point to avoid cross-contamination between rows in the dataset.\n", - "\n", - "We do this by defining an agent factory. Below, we integrate our agent into a `CustomRunnableAgentFactory`, which helps create the environment and agent executor for each data point." + "Here are the instructions for the task" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "629416b3-b5d6-45ad-9bda-4f0642a0eb13", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 9, + "id": "8e1f0a3d-fed6-41f7-8825-08787a57ad98", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. 
For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\"" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from langchain_benchmarks.tool_usage.agents import CustomRunnableAgentFactory\n", - "\n", - "agent_factory = CustomRunnableAgentFactory(task, agent=agent_definition)" + "task.instructions" ] }, { "cell_type": "markdown", - "id": "7f06cf25-6766-4ea5-a566-36af045bdcf4", + "id": "82c9de5d-185b-4776-9ee9-112a2db32139", "metadata": {}, "source": [ - "Let's check that the agent works" + "Let's test it out" ] }, { "cell_type": "code", - "execution_count": 28, - "id": "755f7920-831b-4595-8c6d-cca22c935198", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 10, + "id": "ce67d619-fa99-4c15-bc53-3fb08b40a201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `a` with `{}`\n", + "responded: [{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `b` with `{}`\n", + "responded: [{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n", + "\n", + "\u001b[0m\u001b[33;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `c` with `{}`\n", + "responded: [{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n", + "\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m[]\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + } + ], "source": [ "from langchain import globals\n", "\n", - "globals.set_verbose(True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2804eae-5b0b-4a38-9dff-363a4fe8f324", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "globals.set_verbose(True)\n", "agent = agent_factory()\n", - "agent.invoke({\"question\": \"abc\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "2aa68a11-d268-4868-a862-309801201989", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "agent.invoke({\"question\": \"abc\"})\n", "globals.set_verbose(False)" ] }, @@ -485,12 +471,12 @@ "id": "5e9e5817-3b9d-4a1e-8ee8-692d39aa68ca", "metadata": {}, "source": [ - "This evaluator will be used below when we benchmark on all tasks!" + "Each task is associated with its own task specific evaluator!" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "c88bd6e1-f77e-4668-a143-096929e897ee", "metadata": { "tags": [] @@ -499,10 +485,10 @@ { "data": { "text/plain": [ - "RunEvalConfig(evaluators=[], custom_evaluators=[], reference_key=None, prediction_key=None, input_key=None, eval_llm=None)" + "RunEvalConfig(evaluators=[], custom_evaluators=[], batch_evaluators=None, reference_key=None, prediction_key=None, input_key=None, eval_llm=None)" ] }, - "execution_count": 13, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -522,14 +508,13 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "60466447-eb37-4204-a497-fe47e8d8dd70", - "metadata": { - "tags": [] - }, + "execution_count": 11, + "id": "0770b442-f96a-4670-a4f7-3093f24fb64b", + "metadata": {}, "outputs": [], "source": [ "import datetime\n", + "import uuid\n", "\n", "from langsmith.client import Client\n", "\n", @@ -539,110 +524,89 @@ " model_registry,\n", " registry,\n", ")\n", - "from langchain_benchmarks.rate_limiting import RateLimiter\n", - "from langchain_benchmarks.tool_usage.agents import (\n", - " AnthropicToolUserFactory,\n", - " CustomAgentFactory,\n", - " CustomRunnableAgentFactory,\n", - " OpenAIAgentFactory,\n", - " OpenAIAssistantFactory,\n", - ")" + "from langchain_benchmarks.rate_limiting import RateLimiter" ] }, { - "cell_type": "code", - "execution_count": 15, - "id": "c448d139-9923-4cf6-af49-cbf3dff46bdc", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "15cbded4-5ab5-4b9b-9e88-77b24d3b750c", + "metadata": {}, "source": [ - "import uuid\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:]" + "Create an experiment ID. we'll use it to tag our runs, which we can later use to retrieve run data from LangSmith." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "200df769-4dd9-453b-8500-219c1d5305f6", - "metadata": { - "tags": [] - }, + "execution_count": 12, + "id": "c23208e3-01d1-4e83-9e4a-59544828f6f5", + "metadata": {}, "outputs": [], "source": [ - "tests = [\n", - " # 2-tuple of (architecture, model name)\n", - " (\"openai_functions\", \"gpt-3.5-turbo-1106\"), # Requires OpenAI Creds\n", - " (\"openai_functions\", \"gpt-3.5-turbo-0613\"),\n", - " (\"openai_functions\", \"gpt-4-1106-preview\"),\n", - " (\"openai_functions\", \"gpt-4-0613\"),\n", - " (\"openai_functions\", \"mistral-7b-instruct-v0.1\"), # Requires AnyScale creds\n", - " # Requires Anthropic Creds and Setting up Anthropics Tool Usage package.\n", - " # (\n", - " # \"anthropic_tool_user\",\n", - " # \"claude-2.1\",\n", - " # ),\n", - "]" + "experiment_id = uuid.uuid4().hex[:]" + ] + }, + { + "cell_type": "markdown", + "id": "83050cfc-f50f-4c63-8257-07e7688a54c4", + "metadata": {}, + "source": [ + "Run evaluation against all tasks." ] }, { "cell_type": "code", "execution_count": null, - "id": "5ddf7355-7db9-4adc-bc1e-f04c3d0ec57d", - "metadata": { - "tags": [] - }, + "id": "b2a3463b-1c9f-494b-bcbd-1dc1760ebf19", + "metadata": {}, "outputs": [], "source": [ "client = Client() # Launch langsmith client for cloning datasets\n", "today = datetime.date.today().isoformat()\n", - "rate_limiter = RateLimiter(requests_per_second=2)\n", "\n", - "for task in registry:\n", + "# You can use an optional rate limiter to rate limit your requests!\n", + "rate_limiter = RateLimiter(requests_per_second=1)\n", + "\n", + "\n", + "# Set up 2-tuples of (model name, model instance)\n", + "# You can update this list with any model that supports tool calling.\n", + "# See list here: https://python.langchain.com/docs/integrations/chat/\n", + "tests = [\n", + " (\n", + " \"claude-3-haiku-20240307\",\n", + " ChatAnthropic(model=\"claude-3-haiku-20240307\", temperature=0),\n", + " )\n", + "]\n", + "\n", + "\n", + "for task in registry.tasks:\n", " if task.type != \"ToolUsageTask\":\n", " continue\n", "\n", - " dataset_name = task.name\n", + " dataset_name = task.name + f\" ({today})\"\n", " clone_public_dataset(task.dataset_id, dataset_name=dataset_name)\n", "\n", - " for arch, model in tests:\n", + " for model_name, model in tests:\n", " print()\n", - " print(f\"Benchmarking {task.name} with model: {model} and arch: {arch}\")\n", + " print(f\"Benchmarking {task.name} with model: {model_name}\")\n", " eval_config = task.get_eval_config()\n", "\n", - " if arch == \"openai_functions\":\n", - " agent_factory = OpenAIAgentFactory(\n", - " task, model=model, rate_limiter=rate_limiter\n", - " )\n", - " elif arch == \"custom_agent\":\n", - " agent_factory = CustomAgentFactory(\n", - " task, model=model, rate_limiter=rate_limiter\n", - " )\n", - " elif arch == \"custom_runnable_agent\":\n", - " # For this, the model would have to be a runnable object\n", - " agent_factory = CustomRunnableAgentFactory(task, agent=model)\n", - " elif arch == \"anthropic_tool_user\":\n", - " agent_factory = AnthropicToolUserFactory(task)\n", - " else:\n", - " raise ValueError()\n", + " agent_factory = StandardAgentFactory(\n", + " task, model, prompt, rate_limiter=rate_limiter\n", + " )\n", "\n", " client.run_on_dataset(\n", " dataset_name=dataset_name,\n", " llm_or_chain_factory=agent_factory,\n", " evaluation=eval_config,\n", " verbose=False,\n", - " project_name=f\"{model}-{task.name}-{today}-{experiment_uuid}\",\n", - " tags=[model],\n", + " 
project_name=f\"{model_name}-{task.name}-{today}-{experiment_id}\",\n", " concurrency_level=5,\n", " project_metadata={\n", - " \"model\": model,\n", + " \"model\": model_name,\n", " \"id\": experiment_uuid,\n", " \"task\": task.name,\n", " \"date\": today,\n", " \"langchain_benchmarks_version\": __version__,\n", - " \"arch\": arch,\n", " },\n", " )" ] @@ -656,6 +620,8 @@ "\n", "The following sections demonstrate slightly more \"advanced\" usage if you want to completely customize the agent runtime in a way that is compatible with our test runner.\n", "\n", + "We'll also apply an adapter to the agent which will will capture its inputs and outputs (e.g, add information the agent's environment at the end of the run) so that it we can evaluate it.\n", + "\n", "### Custom Agent Factory\n", "\n", "If you want even more configurability beyond what the `CustomRunnableAgentFactory` provides, you can create your owne `AgentFactory` using the following pattern.\n", @@ -666,33 +632,33 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "bca8ad69-9956-451c-b639-ea30c77d982f", - "metadata": { - "tags": [] - }, + "execution_count": 16, + "id": "69351864-2e97-43df-81ae-5067cbf5e471", + "metadata": {}, "outputs": [], "source": [ - "from langchain.agents import AgentType, initialize_agent\n", - "from langchain.chat_models import ChatOpenAI\n", + "from typing import Optional\n", + "\n", + "from langchain.agents import AgentExecutor, create_tool_calling_agent\n", + "from langchain_anthropic import ChatAnthropic\n", + "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "from langchain_benchmarks.schema import ExtractionTask\n", - "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "44839ebe-48ea-4d5b-87b4-2ad72acacb71", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class AgentFactory:\n", - " def __init__(self, task: ExtractionTask, model: str) -> None:\n", + "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter\n", + "\n", + "\n", + "class CustomAgentFactory:\n", + " def __init__(\n", + " self,\n", + " task: ExtractionTask,\n", + " *,\n", + " # It can be useful to add a rate-limiter\n", + " # which will limit ther number of requests per second\n", + " # when running evaluation.\n", + " rate_limiter: Optional[RateLimiter] = None,\n", + " ) -> None:\n", " self.task = task\n", - " self.model = model\n", + " self.rate_limiter = rate_limiter\n", "\n", " def __call__(self):\n", " # This factory creates a new environment for every agent run.\n", @@ -701,63 +667,127 @@ " # At the end of the run, the environment state will be read.\n", " env = task.create_environment() # Create a new environment for every agent run!\n", " tools = env.tools\n", - " llm = ChatOpenAI(temperature=0, model=self.model)\n", - " agent_executor = initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.OPENAI_FUNCTIONS,\n", - " return_intermediate_steps=True,\n", + " model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n", + " prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", self.task.instructions),\n", + " (\n", + " \"human\",\n", + " \"{question}\",\n", + " ), # Populated from task.instructions automatically\n", + " (\"placeholder\", \"{agent_scratchpad}\"),\n", + " ]\n", + " )\n", + "\n", + " # This is the standard tool calling agent implementation\n", + " # Feel free to replace it with any other implementation you 
want!\n", + " # https://python.langchain.com/docs/modules/agents/how_to/custom_agent/\n", + " agent = create_tool_calling_agent(model, env.tools, prompt)\n", + "\n", + " if self.rate_limiter:\n", + " agent = with_rate_limit(agent, self.rate_limiter)\n", + "\n", + " executor = AgentExecutor(\n", + " agent=agent,\n", + " tools=env.tools,\n", " handle_parsing_errors=True,\n", + " return_intermediate_steps=True,\n", " )\n", + "\n", " # Apply the adapters so that inputs and outputs match dataset schema\n", " # state_reader automatically adds the state of the environment at the end of the run.\n", - " return apply_agent_executor_adapter(agent_executor, state_reader=env.read_state)" + " return apply_agent_executor_adapter(executor, state_reader=env.read_state)" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "8b6108e4-c7cc-42e8-a23d-89c7b94fab6c", - "metadata": { - "tags": [] - }, + "execution_count": 17, + "id": "18a96a6f-812b-4b0e-83c5-d001bf50851e", + "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Name Tool Usage - Typewriter (26 tools)
Type ToolUsageTask
Dataset ID 128af05e-aa00-4e3b-a958-d166dd450581
Description Environment with 26 tools each tool represents a letter of the alphabet.\n", "\n", "The objective of this task is to evaluate the model's ability the use tools\n", "for a simple repetition task.\n", "\n", "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", "\n", "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", "\n", "This is a variation of the typer writer task, where 26 parameterless tools are\n", "given instead of a single tool that takes a letter as an argument.
" + ], "text/plain": [ - "{'input': 'xypxy',\n", - " 'output': 'I have typed \"xypxy\" as you requested.',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'p'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'p'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"p\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK')],\n", - " 'state': 'xypxy'}" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 24, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "agent_factory = AgentFactory(task, \"gpt-4\")\n", - "agent = agent_factory()\n", - "agent.invoke({\"question\": \"xypxy\"})" + "task" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9bdf9328-0103-48d3-8dfc-933423db9796", + "execution_count": 18, + "id": "a7bd4af3-c0f1-4308-abbf-330d7497b3e3", + "metadata": {}, + "outputs": [], + "source": [ + "custom_agent_factory = CustomAgentFactory(task)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c5b69b7c-4294-47d1-85d7-47d718945898", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "agent = custom_agent_factory()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1ac24ef5-d3ca-41aa-b888-7ebcd8a92ff4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'abc',\n", + " 'output': [],\n", + " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_016f6CZwwFmdz2h8KbdGRVjj'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01JvfeTpU3hEuS7PknFk5a8S'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01NbBCY5Fg62RsyAAUd4n2g1'),\n", + " 'OK')],\n", + " 'state': 'abc'}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent.invoke({\"question\": \"abc\"})" + ] } ], "metadata": { @@ -776,7 +806,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/source/notebooks/tool_usage/multiverse_math.ipynb b/docs/source/notebooks/tool_usage/multiverse_math.ipynb index 44eb1b4..00b8cb7 100644 --- a/docs/source/notebooks/tool_usage/multiverse_math.ipynb +++ b/docs/source/notebooks/tool_usage/multiverse_math.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", "metadata": { "tags": [] @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", "metadata": { "tags": [] @@ -65,20 +65,22 @@ "\n", "Name Multiverse Math \n", "Type ToolUsageTask \n", - "Dataset ID 594f9f60-30a0-49bf-b075-f44beabf546a\n", + "Dataset ID 47ed57bc-e852-4f84-a23e-cce4793864e9\n", "DescriptionAn environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", - "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math. \n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\n", + "\n", + "This task is associated with 20 test examples. \n", "\n", "" ], "text/plain": [ - "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})" + "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -108,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "e2439d0c-ccb9-4f5b-a127-548725025a98", "metadata": { "tags": [] @@ -117,14 +119,14 @@ { "data": { "text/plain": [ - "[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=, func=),\n", - " StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=, func=),\n", - " StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=, func=),\n", - " StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=, func=),\n", - " StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=, func=)]" + "[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=, func=),\n", + " StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=, func=),\n", + " StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=, func=),\n", + " StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=, func=),\n", + " StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=, func=)]" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -144,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "f5a100bd-6e19-498f-8a36-393b5c19bcb9", "metadata": { "tags": [] @@ -156,7 +158,7 @@ "8.8" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -175,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "31afb08b-17b8-4866-86c1-ee24e804415c", "metadata": { "tags": [] @@ -187,7 +189,7 @@ "'You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.'" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -210,1101 +212,86 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': 'how much is 3 + 5',\n", - " 'output': '9.2',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='add', tool_input={'a': 3, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 3, 'b': 5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"a\": 3,\\n \"b\": 5\\n}', 'name': 'add'}})]),\n", - " 9.2)]}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from langchain_benchmarks.tool_usage import agents\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-4-0613\")\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", "\n", - "# Let's test that our agent works\n", - "agent = agent_factory.create()\n", - "agent.invoke({\"question\": \"how much is 3 + 5\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", "\n", - "Let's evaluate an agent now" + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, + "execution_count": 7, + "id": "11e4fff5-e184-45e1-a472-c0a9f70e897a", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", - "View the evaluation results for project 'multiverse-math-gpt-3.5-turbo-1106-d680' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/0919eab8-dca7-4049-a6eb-4067b9862eba?eval=true\n", "\n", - "View all tests for Dataset Multiverse Math at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n", - "[------------------------------------------------->] 10/10\n", - "View the evaluation results for project 'multiverse-math-gpt-3.5-turbo-0613-d680' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/126d8555-c69c-4ab7-ae95-2ad9bc41989e?eval=true\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `add` with `{'a': 2, 'b': 5}`\n", + "\n", "\n", - "View all tests for Dataset Multiverse Math at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n", - "[------------------------------------------------->] 10/10\n", - "View the evaluation results for project 'multiverse-math-gpt-4-0613-d680' at:\n", - 
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/619eb2de-8920-41db-91a1-15ee407422de?eval=true\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m8.2\u001b[0m\u001b[32;1m\u001b[1;3mThe result of 2 + 5 in this alternate mathematical universe is 8.2.\u001b[0m\n", "\n", - "View all tests for Dataset Multiverse Math at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n", - "[------------------------------------------------->] 10/10" + "\u001b[1m> Finished chain.\u001b[0m\n" ] - } - ], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "\n", - "client = Client()\n", - "\n", - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-0613\"]\n", - "\n", - "for model in models:\n", - " print()\n", - " agent_factory = agents.OpenAIAgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " project_name=f\"multiverse-math-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", - "metadata": {}, - "source": [ - "## Analyze\n", - "\n", - "You can take a look at the underlying results." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "78c4cc84-43c2-4084-a63b-dc10a5c01856", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from langsmith.client import Client\n", - "\n", - "client = Client()\n", - "projects = list(client.list_projects(reference_dataset_name=\"Multiverse Math\"))\n", - "\n", - "dfs = []\n", - "for project in projects:\n", - " first_root_run = next(\n", - " client.list_runs(project_name=project.name, execution_order=1)\n", - " )\n", - " # Temporary way to get tag information\n", - " tags = first_root_run.tags\n", - " test_results = client.get_test_results(project_name=project.name)\n", - " test_results[\"model\"] = tags[0]\n", - " dfs.append(test_results)\n", - "\n", - "\n", - "df = pd.concat(dfs)\n", - "\n", - "df[\"actual_steps\"] = df[\"outputs.intermediate_steps\"].apply(\n", - " lambda steps: [step[0][\"tool\"] for step in steps]\n", - ")\n", - "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)" - ] - }, - { - "cell_type": "markdown", - "id": "0ab0792e-e04a-400e-9726-5c123836f710", - "metadata": {}, - "source": [ - "### Stats\n", - "\n", - "This is a really small dataset so it's hard to tell whether there are substantial differences between the models; however, the agents are clearly not perfect here.\n", - "\n", - "The results are suggestive of the fact that it's more difficult for gpt-4 to ignore what it knows about math (which isn't surprising); e.g., in this universe the negative of -5 is still -5 (rather than 5).\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "53d85491-a5a0-4448-bccc-7171e03ffb21", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feedback.correctnessfeedback.Intermediate steps correctnessexecution_timefeedback.# steps / # expected steps# correctn
model
gpt-3.5-turbo-06130.80.87.9929281.033338.010
gpt-3.5-turbo-11060.60.68.9331720.933326.010
gpt-4-06130.50.68.3295580.766665.010
\n", - "
" - ], "text/plain": [ - " feedback.correctness \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.8 \n", - "gpt-3.5-turbo-1106 0.6 \n", - "gpt-4-0613 0.5 \n", - "\n", - " feedback.Intermediate steps correctness execution_time \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.8 7.992928 \n", - "gpt-3.5-turbo-1106 0.6 8.933172 \n", - "gpt-4-0613 0.6 8.329558 \n", - "\n", - " feedback.# steps / # expected steps # correct n \n", - "model \n", - "gpt-3.5-turbo-0613 1.03333 8.0 10 \n", - "gpt-3.5-turbo-1106 0.93332 6.0 10 \n", - "gpt-4-0613 0.76666 5.0 10 " + "{'question': 'how much is 2+5',\n", + " 'output': 'The result of 2 + 5 in this alternate mathematical universe is 8.2.',\n", + " 'intermediate_steps': [(ToolAgentAction(tool='add', tool_input={'a': 2, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 2, 'b': 5}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_MZMnEZrae7AuXYtWzH0l9xKL', 'function': {'arguments': '{\"a\":2,\"b\":5}', 'name': 'add'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-b7548303-194d-40ee-85bf-3d43cac39526', tool_calls=[{'name': 'add', 'args': {'a': 2, 'b': 5}, 'id': 'call_MZMnEZrae7AuXYtWzH0l9xKL'}], tool_call_chunks=[{'name': 'add', 'args': '{\"a\":2,\"b\":5}', 'id': 'call_MZMnEZrae7AuXYtWzH0l9xKL', 'index': 0}])], tool_call_id='call_MZMnEZrae7AuXYtWzH0l9xKL'),\n", + " 8.2)]}" ] }, - "execution_count": 11, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "correct_df = df.groupby(\"model\")[\"feedback.correctness\"].sum().to_frame(\"# correct\")\n", - "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n", + "from langchain import globals\n", "\n", - "columns = [\n", - " \"feedback.correctness\",\n", - " \"feedback.Intermediate steps correctness\",\n", - " \"execution_time\",\n", - " \"feedback.# steps / # expected steps\",\n", - "]\n", + "globals.set_verbose(True)\n", "\n", - "df.groupby(\"model\")[columns].mean().join(correct_df).join(count_df)" + "agent = agent_factory()\n", + "agent.invoke({\"question\": \"how much is 2+5\"})" ] }, { "cell_type": "markdown", - "id": "c3ac1946-a7cb-4cd2-8de1-d61c46966d06", + "id": "b29a915c-1041-4108-a234-a877b6f59de4", "metadata": {}, "source": [ - "### Individual" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "1798f587-38a1-439e-8c1e-f9eeb3a23c8d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodelactual_stepsreference.expected_stepsoutputs.outputreference.referencefeedback.correctnessnum_expected_steps
example_id
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-0613[add][add]The sum of 2 and 3 in this alternate mathemati...6.201.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-1106[add][add]The result of adding 2 and 3 is 6.2.6.201.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-4-0613[add][add]6.26.201.01
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-0613[add, multiply][add, multiply]You ate a total of 32.34 fruits.32.341.02
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-1106[add][add, multiply]You ate 16.2 fruits.32.340.02
\n", - "
" - ], - "text/plain": [ - " input.question \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "\n", - " model actual_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-0613 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-1106 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-4-0613 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-0613 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-1106 [add] \n", - "\n", - " reference.expected_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "\n", - " outputs.output \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The sum of 2 and 3 in this alternate mathemati... \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The result of adding 2 and 3 is 6.2. \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate a total of 32.34 fruits. \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate 16.2 fruits. \n", - "\n", - " reference.reference \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n", - "\n", - " feedback.correctness num_expected_steps \n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 1.0 2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 0.0 2 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "columns = [\n", - " \"input.question\",\n", - " \"model\",\n", - " \"actual_steps\",\n", - " \"reference.expected_steps\",\n", - " \"outputs.output\",\n", - " \"reference.reference\",\n", - " \"feedback.correctness\",\n", - " \"num_expected_steps\",\n", - "]\n", - "df[columns].sort_values(by=[\"input.question\", \"model\"]).head()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "016fbe05-a993-492c-95db-69d3ba756495", - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodelactual_stepsreference.expected_stepsoutputs.outputreference.referencefeedback.correctnessnum_expected_steps
example_id
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-0613[add][add]The sum of 2 and 3 in this alternate mathemati...6.2000001.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-1106[add][add]The result of adding 2 and 3 is 6.2.6.2000001.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-4-0613[add][add]6.26.2000001.01
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-0613[add, multiply][add, multiply]You ate a total of 32.34 fruits.32.3400001.02
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-1106[add][add, multiply]You ate 16.2 fruits.32.3400000.02
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-4-0613[add, multiply][add, multiply]32.3432.3400001.02
c857031a-6ab1-4b06-9638-3a8a4ba69f11Subtract 3 from 2gpt-3.5-turbo-0613[subtract][subtract]The result of subtracting 3 from 2 in this alt...-4.0000001.01
c857031a-6ab1-4b06-9638-3a8a4ba69f11Subtract 3 from 2gpt-3.5-turbo-1106[subtract][subtract]The result of subtracting 3 from 2 is -4.-4.0000001.01
c857031a-6ab1-4b06-9638-3a8a4ba69f11Subtract 3 from 2gpt-4-0613[subtract][subtract]-4.0-4.0000001.01
75db51d4-5c3b-4312-9eb9-b40c74eafdcdWhat is -5 if evaluated using the negate funct...gpt-3.5-turbo-0613[negate][negate]The result of evaluating -5 using the negate f...-5.0000001.01
75db51d4-5c3b-4312-9eb9-b40c74eafdcdWhat is -5 if evaluated using the negate funct...gpt-3.5-turbo-1106[negate][negate]The result of evaluating -5 using the negate f...-5.0000001.01
75db51d4-5c3b-4312-9eb9-b40c74eafdcdWhat is -5 if evaluated using the negate funct...gpt-4-0613[negate][negate]5-5.0000000.01
2a20a13d-050e-4a16-84ff-22d9582f1449after calculating the sin of 1.5 radians, divi...gpt-3.5-turbo-0613[sin, cos, divide][sin, cos, divide]The result of dividing the sine of 1.5 radians...0.0354571.03
2a20a13d-050e-4a16-84ff-22d9582f1449after calculating the sin of 1.5 radians, divi...gpt-3.5-turbo-1106[sin, cos, divide][sin, cos, divide]The result is 0.035457422151326225.0.0354571.03
2a20a13d-050e-4a16-84ff-22d9582f1449after calculating the sin of 1.5 radians, divi...gpt-4-0613[sin, cos, divide][sin, cos, divide]0.0354574221513262250.0354571.03
4ac33c1a-62f0-4da4-9455-07b582f6ff52calculate 101 to the power of 0.5 to 4 digits ...gpt-3.5-turbo-0613[power, power, power, power][power, round]The result of 101 to the power of 0.5 to 4 dig...102518.7812000.02
4ac33c1a-62f0-4da4-9455-07b582f6ff52calculate 101 to the power of 0.5 to 4 digits ...gpt-3.5-turbo-1106[power, power, power][power, round]3.8109e+37102518.7812000.02
4ac33c1a-62f0-4da4-9455-07b582f6ff52calculate 101 to the power of 0.5 to 4 digits ...gpt-4-0613[power][power, round]102519102518.7812000.02
2e82a924-8382-425e-8738-daa2d912e9feconvert 15 degrees to radiansgpt-3.5-turbo-0613[divide][pi, multiply, divide]15 degrees is approximately 0.0417 radians.0.1245880.03
2e82a924-8382-425e-8738-daa2d912e9feconvert 15 degrees to radiansgpt-3.5-turbo-1106[pi, divide][pi, multiply, divide]15 degrees is approximately 0.0417 radians.0.1245880.03
2e82a924-8382-425e-8738-daa2d912e9feconvert 15 degrees to radiansgpt-4-0613[multiply][pi, multiply, divide]0.287979450.1245880.03
67867526-791a-452f-b534-ef2c1f5efd20ecoli divides every 20 minutes. How many cells...gpt-3.5-turbo-0613[divide, power, multiply][divide, power, multiply]After 2 hours, starting with 5 cells, there wi...176.0000001.03
67867526-791a-452f-b534-ef2c1f5efd20ecoli divides every 20 minutes. How many cells...gpt-3.5-turbo-1106[divide, power][divide, power, multiply]After 2 hours, there will be 2187 cells.176.0000000.03
67867526-791a-452f-b534-ef2c1f5efd20ecoli divides every 20 minutes. How many cells...gpt-4-0613[multiply][divide, power, multiply]352.0176.0000000.03
27c44572-6c67-4129-a95a-fe1509c350bemultiply the result of (log of 100 to base 10)...gpt-3.5-turbo-0613[log, multiply][log, multiply]The result of multiplying the logarithm of 100...6.2223191.02
27c44572-6c67-4129-a95a-fe1509c350bemultiply the result of (log of 100 to base 10)...gpt-3.5-turbo-1106[log, multiply][log, multiply]The result is 6.2223186933233666.2223191.02
27c44572-6c67-4129-a95a-fe1509c350bemultiply the result of (log of 100 to base 10)...gpt-4-0613[multiply][log, multiply]19.86.2223190.02
dd079541-c0da-4d94-85b7-50f0516a9ca1what is the result of 2 to the power of 3?gpt-3.5-turbo-0613[power][power]The result of 2 to the power of 3 is 32.32.0000001.01
dd079541-c0da-4d94-85b7-50f0516a9ca1what is the result of 2 to the power of 3?gpt-3.5-turbo-1106[power][power]The result of 2 to the power of 3 is 32.32.0000001.01
dd079541-c0da-4d94-85b7-50f0516a9ca1what is the result of 2 to the power of 3?gpt-4-0613[power][power]32.032.0000001.01
\n", - "
" - ], - "text/plain": [ - " input.question \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 Subtract 3 from 2 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 Subtract 3 from 2 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 Subtract 3 from 2 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd What is -5 if evaluated using the negate funct... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd What is -5 if evaluated using the negate funct... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd What is -5 if evaluated using the negate funct... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", - "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", - "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", - "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", - "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", - "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", - "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", - "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", - "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", - "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 what is the result of 2 to the power of 3? \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 what is the result of 2 to the power of 3? \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 what is the result of 2 to the power of 3? 
\n", - "\n", - " model \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-0613 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-1106 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-4-0613 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-0613 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-1106 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-4-0613 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 gpt-3.5-turbo-0613 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 gpt-3.5-turbo-1106 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 gpt-4-0613 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd gpt-3.5-turbo-0613 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd gpt-3.5-turbo-1106 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd gpt-4-0613 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 gpt-3.5-turbo-0613 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 gpt-3.5-turbo-1106 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 gpt-4-0613 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 gpt-3.5-turbo-0613 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 gpt-3.5-turbo-1106 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 gpt-4-0613 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe gpt-3.5-turbo-0613 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe gpt-3.5-turbo-1106 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe gpt-4-0613 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 gpt-3.5-turbo-0613 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 gpt-3.5-turbo-1106 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 gpt-4-0613 \n", - "27c44572-6c67-4129-a95a-fe1509c350be gpt-3.5-turbo-0613 \n", - "27c44572-6c67-4129-a95a-fe1509c350be gpt-3.5-turbo-1106 \n", - "27c44572-6c67-4129-a95a-fe1509c350be gpt-4-0613 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 gpt-3.5-turbo-0613 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 gpt-3.5-turbo-1106 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 gpt-4-0613 \n", - "\n", - " actual_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, power, power, power] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, power, power] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [multiply] \n", - 
"dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "\n", - " reference.expected_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, round] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, round] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, round] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, multiply, divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, multiply, divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, multiply, divide] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "\n", - " outputs.output \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The sum of 2 and 3 in this alternate mathemati... \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The result of adding 2 and 3 is 6.2. \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate a total of 32.34 fruits. \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate 16.2 fruits. \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 The result of subtracting 3 from 2 in this alt... \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 The result of subtracting 3 from 2 is -4. \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.0 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd The result of evaluating -5 using the negate f... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd The result of evaluating -5 using the negate f... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 5 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 The result of dividing the sine of 1.5 radians... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 The result is 0.035457422151326225. \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457422151326225 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 The result of 101 to the power of 0.5 to 4 dig... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 3.8109e+37 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102519 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 15 degrees is approximately 0.0417 radians. 
\n", - "2e82a924-8382-425e-8738-daa2d912e9fe 15 degrees is approximately 0.0417 radians. \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.28797945 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 After 2 hours, starting with 5 cells, there wi... \n", - "67867526-791a-452f-b534-ef2c1f5efd20 After 2 hours, there will be 2187 cells. \n", - "67867526-791a-452f-b534-ef2c1f5efd20 352.0 \n", - "27c44572-6c67-4129-a95a-fe1509c350be The result of multiplying the logarithm of 100... \n", - "27c44572-6c67-4129-a95a-fe1509c350be The result is 6.222318693323366 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 19.8 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 The result of 2 to the power of 3 is 32. \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 The result of 2 to the power of 3 is 32. \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.0 \n", - "\n", - " reference.reference \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.200000 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.200000 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.200000 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.340000 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.340000 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.340000 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.000000 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.000000 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.000000 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd -5.000000 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd -5.000000 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd -5.000000 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102518.781200 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102518.781200 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102518.781200 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.124588 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.124588 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.124588 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 176.000000 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 176.000000 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 176.000000 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 6.222319 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 6.222319 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 6.222319 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.000000 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.000000 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.000000 \n", - "\n", - " feedback.correctness num_expected_steps \n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 1.0 2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 0.0 2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 1.0 2 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 1.0 1 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 1.0 1 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 1.0 1 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 1.0 1 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 1.0 1 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 0.0 1 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 1.0 3 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 1.0 3 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 1.0 3 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 2 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 2 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 2 \n", - 
"2e82a924-8382-425e-8738-daa2d912e9fe 0.0 3 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.0 3 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.0 3 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 1.0 3 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 0.0 3 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 0.0 3 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 1.0 2 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 1.0 2 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 0.0 2 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 1.0 1 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 1.0 1 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 1.0 1 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[columns].sort_values(by=[\"input.question\", \"model\"])" + "## Benchmarking\n", + "\n", + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." ] } ], diff --git a/docs/source/notebooks/tool_usage/relational_data.ipynb b/docs/source/notebooks/tool_usage/relational_data.ipynb index 1c781d7..5a4b253 100644 --- a/docs/source/notebooks/tool_usage/relational_data.ipynb +++ b/docs/source/notebooks/tool_usage/relational_data.ipynb @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "27b6b0fd-639d-43a7-a730-9acdc5b2f102", "metadata": { "tags": [] @@ -97,14 +97,14 @@ { "data": { "text/plain": [ - "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n Args:\\n user_id: The user's ID.\\n\\n Returns:\\n The user's name.\", args_schema=, handle_tool_error=True, func=.get_user_name at 0x7fbb0e864f40>),\n", - " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=, handle_tool_error=True, func=.list_user_ids at 0x7fbb0e864fe0>),\n", - " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n Args:\\n name: The name to search for.\\n\\n Returns:\\n The list of matching users.', args_schema=, handle_tool_error=True, func=.find_users_by_name at 0x7fbb0e865080>),\n", - " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=, handle_tool_error=True, func=.find_locations_by_name at 0x7fbb0e865120>),\n", - " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=, handle_tool_error=True, func=.find_foods_by_name at 0x7fbb0e8651c0>)]" + "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n Args:\\n user_id: The user's ID.\\n\\n Returns:\\n The user's name.\", args_schema=, handle_tool_error=True, func=.get_user_name at 0x78f30602fec0>),\n", + " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=, handle_tool_error=True, func=.list_user_ids at 0x78f30602fe20>),\n", + " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> 
List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n Args:\\n name: The name to search for.\\n\\n Returns:\\n The list of matching users.', args_schema=, handle_tool_error=True, func=.find_users_by_name at 0x78f306058040>),\n", + " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=, handle_tool_error=True, func=.find_locations_by_name at 0x78f3060580e0>),\n", + " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=, handle_tool_error=True, func=.find_foods_by_name at 0x78f306058180>)]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "7f1c1242-449c-4536-863d-b62bf6d2dff1", "metadata": { "tags": [] @@ -128,7 +128,7 @@ "'Bob'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "854e139b-a120-4012-bdf4-6394e0b1c42d", "metadata": { "tags": [] @@ -155,7 +155,7 @@ " {'id': 5, 'city': 'Miami'}]" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -169,105 +169,46 @@ "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", "metadata": {}, "source": [ - "## Agent Factory\n", + "## Explore the task\n", "\n", "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n", "\n", - "The `AgentExecutor` should accept `question` as an input and include the fields `output`, `intermediate_steps` and potentially `state` in its response -- for this we\n", - "will wrap the agent executor in an adapter (`apply_agent_executor_adapter`) that will help match the expected schema.\n", - "\n", - "Please reference the LangChain documentation to see how to [use and implement agents](https://python.langchain.com/docs/modules/agents/)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1c2d80d2-4ddf-4b80-b6c5-331133a85314", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.agents import AgentType, initialize_agent\n", - "from langchain.chat_models import ChatOpenAI\n", - "\n", - "from langchain_benchmarks.schema import ExtractionTask\n", - "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter" + "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one." 
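If you do want a custom factory, any callable that builds a fresh runnable per run will do: it should accept `question` and return `output` and `intermediate_steps`. The sketch below is one possible shape, loosely modeled on the older hand-rolled factory this diff removes; it assumes `create_tool_calling_agent` from `langchain.agents` and uses `apply_agent_executor_adapter` so the inputs and outputs match the dataset schema.

from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI

from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter


def custom_agent_factory():
    # Create a new environment for every run -- some environments carry mutable state.
    env = task.create_environment()
    llm = ChatOpenAI(temperature=0)
    prompt = ChatPromptTemplate.from_messages(
        [
            # StandardAgentFactory injects this via an "{instructions}" variable instead.
            ("system", task.instructions),
            ("human", "{question}"),
            ("placeholder", "{agent_scratchpad}"),
        ]
    )
    agent = create_tool_calling_agent(llm, env.tools, prompt)
    executor = AgentExecutor(
        agent=agent,
        tools=env.tools,
        return_intermediate_steps=True,
        handle_parsing_errors=True,
    )
    # Adapt to the dataset schema: accept "question", return "output" and
    # "intermediate_steps", and attach the final environment state via env.read_state.
    return apply_agent_executor_adapter(executor, state_reader=env.read_state)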
] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "81c0e4a1-f56e-4117-8804-4161c642b068", "metadata": { "tags": [] }, "outputs": [], "source": [ - "class AgentFactory:\n", - " def __init__(self, task: ExtractionTask, model: str) -> None:\n", - " self.task = task\n", - " self.model = model\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", "\n", - " def __call__(self):\n", - " # This factory creates a new environment for every agent run.\n", - " # The reason is that the environment may be associated with an environment state (e.g., typewriter)\n", - " # which is changed by the actions of the agent.\n", - " # At the end of the run, the environment state will be read.\n", - " env = task.create_environment() # Create a new environment for every agent run!\n", - " tools = env.tools\n", - " llm = ChatOpenAI(temperature=0, model=self.model)\n", - " agent_executor = initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.OPENAI_FUNCTIONS,\n", - " return_intermediate_steps=True,\n", - " handle_parsing_errors=True,\n", - " )\n", - " # Apply the adapters so that inputs and outputs match dataset schema\n", - " # state_reader automatically adds the state of the environment at the end of the run.\n", - " return apply_agent_executor_adapter(agent_executor, state_reader=env.read_state)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0ae8c6be-899c-44a6-a89b-0fc04c2cb05c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-32k-0613\"]\n", - "agent_factory = AgentFactory(task, models[0])" - ] - }, - { - "cell_type": "markdown", - "id": "87a64f76-65ae-4367-b43f-f2be3431e7af", - "metadata": {}, - "source": [ - "Let's test that our agent works" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "127a8aa5-839c-469c-a870-7b498f37c187", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain import globals\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", + "\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", "\n", - "globals.set_verbose(True)" + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "code", "execution_count": 11, - "id": "0e4896fa-3633-44a1-857f-80a263cf2e03", + "id": "382ff2f6-8099-415e-a58c-e659345f52fc", "metadata": { "tags": [] }, @@ -280,11 +221,11 @@ "\n", "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3m\n", - "Invoking: `find_locations_by_name` with `{'city': 'Los Angeles'}`\n", + "Invoking: `find_locations_by_name` with `{'city': 'LA'}`\n", "\n", "\n", - "\u001b[0m\u001b[36;1m\u001b[1;3m[{'id': 2, 'city': 'Los Angeles'}, {'id': 4, 'city': 'Houston'}, {'id': 1, 'city': 'New York'}, {'id': 3, 'city': 'Chicago'}, {'id': 5, 'city': 'Miami'}]\u001b[0m\u001b[32;1m\u001b[1;3m\n", - "Invoking: `get_weather_at_location` with `{'location_id': 2}`\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m[{'id': 2, 'city': 'Los Angeles'}, {'id': 1, 'city': 'New York'}, {'id': 3, 'city': 
'Chicago'}, {'id': 4, 'city': 'Houston'}, {'id': 5, 'city': 'Miami'}]\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `get_current_weather_for_location` with `{'location_id': 2}`\n", "\n", "\n", "\u001b[0m\u001b[36;1m\u001b[1;3mSunny, Temperature: 75°F\u001b[0m\u001b[32;1m\u001b[1;3mThe weather in Los Angeles is sunny with a temperature of 75°F.\u001b[0m\n", @@ -295,15 +236,15 @@ { "data": { "text/plain": [ - "{'input': 'whats the weather in LA?',\n", + "{'question': 'what is the weather in LA',\n", " 'output': 'The weather in Los Angeles is sunny with a temperature of 75°F.',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='find_locations_by_name', tool_input={'city': 'Los Angeles'}, log=\"\\nInvoking: `find_locations_by_name` with `{'city': 'Los Angeles'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\"city\":\"Los Angeles\"}', 'name': 'find_locations_by_name'}})]),\n", + " 'intermediate_steps': [(ToolAgentAction(tool='find_locations_by_name', tool_input={'city': 'LA'}, log=\"\\nInvoking: `find_locations_by_name` with `{'city': 'LA'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo', 'function': {'arguments': '{\"city\":\"LA\"}', 'name': 'find_locations_by_name'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-23ccffb0-3b17-46a4-b42e-5eaa3220b211', tool_calls=[{'name': 'find_locations_by_name', 'args': {'city': 'LA'}, 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo'}], tool_call_chunks=[{'name': 'find_locations_by_name', 'args': '{\"city\":\"LA\"}', 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo', 'index': 0}])], tool_call_id='call_hJrCZgP4eDgaj6s4RtCKXTOo'),\n", " [{'id': 2, 'city': 'Los Angeles'},\n", - " {'id': 4, 'city': 'Houston'},\n", " {'id': 1, 'city': 'New York'},\n", " {'id': 3, 'city': 'Chicago'},\n", + " {'id': 4, 'city': 'Houston'},\n", " {'id': 5, 'city': 'Miami'}]),\n", - " (AgentActionMessageLog(tool='get_weather_at_location', tool_input={'location_id': 2}, log=\"\\nInvoking: `get_weather_at_location` with `{'location_id': 2}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\"location_id\":2}', 'name': 'get_weather_at_location'}})]),\n", + " (ToolAgentAction(tool='get_current_weather_for_location', tool_input={'location_id': 2}, log=\"\\nInvoking: `get_current_weather_for_location` with `{'location_id': 2}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_lopYjo00MF9mZtnHtiisTqyp', 'function': {'arguments': '{\"location_id\":2}', 'name': 'get_current_weather_for_location'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9bba5827-d98b-464d-8028-25eb4a05d227', tool_calls=[{'name': 'get_current_weather_for_location', 'args': {'location_id': 2}, 'id': 'call_lopYjo00MF9mZtnHtiisTqyp'}], tool_call_chunks=[{'name': 'get_current_weather_for_location', 'args': '{\"location_id\":2}', 'id': 'call_lopYjo00MF9mZtnHtiisTqyp', 'index': 0}])], tool_call_id='call_lopYjo00MF9mZtnHtiisTqyp'),\n", " 'Sunny, Temperature: 75°F')]}" ] }, @@ -313,270 +254,31 @@ } ], "source": [ - "agent = agent_factory()\n", - "agent.invoke({\"question\": \"whats the weather in LA?\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "43edee23-109d-4f75-be68-d2b4b3240c9b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "globals.set_verbose(False)" - ] - }, - { - 
"cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", - "\n", - "Let's evaluate an agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "2e02fb65-eecf-43b8-bf76-1e86ca535da0", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "View the evaluation results for project 'tool-usage-relational-data-gpt-3.5-turbo-1106-8258' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/8aae8e36-720a-42c8-8540-5d5475e7181e?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Relational Data at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n", - "[------------------------------------------------->] 21/21\n", - "View the evaluation results for project 'tool-usage-relational-data-gpt-3.5-turbo-0613-8258' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/d8773df1-b054-41e4-a947-7b256ca8738b?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Relational Data at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n", - "[------------------------------------------------->] 21/21\n", - "View the evaluation results for project 'tool-usage-relational-data-gpt-4-0613-8258' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/090fecae-923f-4281-93f7-2c5253a2a2a4?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Relational Data at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n", - "[------------------------------------------------->] 21/21" - ] - } - ], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name) # Clone dataset\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "\n", - "client = Client()\n", + "from langchain import globals\n", "\n", - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-0613\"]\n", + "globals.set_verbose(True)\n", "\n", - "for model in models:\n", - " print()\n", - " agent_factory = AgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " project_name=f\"tool-usage-relational-data-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" + "agent = agent_factory()\n", + "agent.invoke({\"question\": \"what is the weather in LA\"})" ] }, { "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", + "id": "142ac640-3ce0-4f38-89cd-8d24d65997e4", "metadata": {}, "source": [ - "## Inspect\n", + "## Benchmarking\n", "\n", - "Here, we'll take a look at the underlying results a little bit." + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." 
] }, { "cell_type": "code", - "execution_count": 24, - "id": "fe9b20c4-9da0-47a2-95a3-b5660a54855a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from langsmith.client import Client\n", - "\n", - "client = Client()\n", - "projects = list(\n", - " client.list_projects(reference_dataset_name=\"Tool Usage - Relational Data\")\n", - ")\n", - "\n", - "dfs = []\n", - "for project in projects:\n", - " first_root_run = next(\n", - " client.list_runs(project_name=project.name, execution_order=1)\n", - " )\n", - " # Temporary way to get tag information\n", - " tags = first_root_run.tags\n", - " test_results = client.get_test_results(project_name=project.name)\n", - " test_results[\"model\"] = tags[0]\n", - " dfs.append(test_results)\n", - "\n", - "\n", - "df = pd.concat(dfs)" - ] - }, - { - "cell_type": "markdown", - "id": "da6962a1-81f2-445f-8547-513a105a3847", - "metadata": {}, - "source": [ - "### Stats" - ] - }, - { - "cell_type": "markdown", - "id": "4b7d366a-8754-417a-a654-956528f134e2", + "execution_count": null, + "id": "e49455cc-13c5-4ea6-bb4b-e61c39ea0267", "metadata": {}, - "source": [ - "In terms of function usage, gpt-4 uses more calls than is strictly necessary (`feedback.# steps / # expected steps` is > 1). However, it's doing a pretty good job.\n", - "\n", - "The gpt-3.5 models do not use tools enough (`feedback.# steps / # expected steps` is < 1) and as a result do a worse job at the task.\n", - "\n", - "Note: The intermediate step correctness happens to have the same average for the 3 models -- this is just a coincidence you can confirm by inspecting underlying results." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "066551f2-eb30-4bc1-94fd-0ca0085103ad", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feedback.correctnessfeedback.Intermediate steps correctnessexecution_timefeedback.# steps / # expected stepsn
model
gpt-3.5-turbo-06130.7142860.7142864.8295060.82539021
gpt-3.5-turbo-11060.8571430.7142865.4642180.96587121
gpt-4-06130.9523810.7142868.5443581.03730021
\n", - "
" - ], - "text/plain": [ - " feedback.correctness \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.714286 \n", - "gpt-3.5-turbo-1106 0.857143 \n", - "gpt-4-0613 0.952381 \n", - "\n", - " feedback.Intermediate steps correctness execution_time \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.714286 4.829506 \n", - "gpt-3.5-turbo-1106 0.714286 5.464218 \n", - "gpt-4-0613 0.714286 8.544358 \n", - "\n", - " feedback.# steps / # expected steps n \n", - "model \n", - "gpt-3.5-turbo-0613 0.825390 21 \n", - "gpt-3.5-turbo-1106 0.965871 21 \n", - "gpt-4-0613 1.037300 21 " - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n", - "df.groupby(\"model\")[\n", - " [\n", - " \"feedback.correctness\",\n", - " \"feedback.Intermediate steps correctness\",\n", - " \"execution_time\",\n", - " \"feedback.# steps / # expected steps\",\n", - " ]\n", - "].mean().join(count_df)" - ] + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/source/notebooks/tool_usage/typewriter_1.ipynb b/docs/source/notebooks/tool_usage/typewriter_1.ipynb index 9f1a2d2..93a21b3 100644 --- a/docs/source/notebooks/tool_usage/typewriter_1.ipynb +++ b/docs/source/notebooks/tool_usage/typewriter_1.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", "metadata": { "tags": [] @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", "metadata": { "tags": [] @@ -60,10 +60,10 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. 
\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 13, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -110,7 +110,7 @@ { "data": { "text/plain": [ - "[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=, func=.type_letter at 0x7f1791bd3f60>)]" + "[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=, func=.type_letter at 0x73a65909ee80>)]" ] }, "execution_count": 4, @@ -208,1404 +208,119 @@ "id": "cd13d120-1bf9-481c-9392-c15ebdd9d77f", "metadata": {}, "source": [ - "## Agent Factory\n", + "## Explore the task\n", "\n", "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n", "\n", - "We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own." + "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "id": "e2acab1e-78a7-4198-8e79-4529c95ce7e2", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from langchain_benchmarks.tool_usage import agents\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", + "\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", "\n", - "# Let's test that our agent works\n", - "agent = agent_factory()" + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "id": "ceaa8edf-292b-48a1-be94-e6bfea0e75b1", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': 'abc',\n", - " 'output': 'a, b, c',\n", - " 'intermediate_steps': [(OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_uXlSgkG7N9nBCjYPB6SZn0n4', 'function': {'arguments': '{\\n \"letter\": \"a\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_uXlSgkG7N9nBCjYPB6SZn0n4'),\n", - " 'OK'),\n", - " (OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_dbSJb120AxFn55XcJHR0xH1I', 'function': {'arguments': '{\\n \"letter\": \"b\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_dbSJb120AxFn55XcJHR0xH1I'),\n", - " 'OK'),\n", - " (OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessage(content='', 
additional_kwargs={'tool_calls': [{'id': 'call_sFV4km9Jd9BOGO7A3oo1op0b', 'function': {'arguments': '{\\n \"letter\": \"c\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_sFV4km9Jd9BOGO7A3oo1op0b'),\n", - " 'OK')],\n", - " 'state': 'abc'}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agent.invoke({\"question\": \"abc\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval" - ] - }, - { - "cell_type": "markdown", - "id": "bc860fc6-89db-4929-926a-69b6320616ab", - "metadata": {}, - "source": [ - "Let's evaluate an agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", - "View the evaluation results for project 'typewriter-1-gpt-3.5-turbo-1106-7709' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/d29cf7d9-9cfa-4fcd-8380-8c339b940972?eval=true\n", "\n", - "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n", - "[------------------------------------------------->] 20/20\n", - "View the evaluation results for project 'typewriter-1-gpt-3.5-turbo-0613-7709' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/044be5ad-0871-4b08-bf5c-1dd6ba94f53b?eval=true\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `type_letter` with `{'letter': 'a'}`\n", + "\n", "\n", - "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n", - "[------------------------------------------------->] 20/20\n", - "View the evaluation results for project 'typewriter-1-gpt-4-0613-7709' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/b7ec2c5f-2a28-4bf7-828e-7a65ea5984be?eval=true\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `type_letter` with `{'letter': 'b'}`\n", "\n", - "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n", - "[------------------------------------------------->] 20/20" + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `type_letter` with `{'letter': 'c'}`\n", + "\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3mabc\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" ] - } - ], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n", - "\n", - "\n", - "client = Client()\n", - "\n", - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-0613\"]\n", - "\n", - "for model in models:\n", - " # Will evaluate the trajectory and state, but not the output which is meaningless for this task.\n", - " print()\n", - " agent_factory = 
agents.OpenAIAgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " project_name=f\"typewriter-1-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", - "metadata": {}, - "source": [ - "## Inspect\n", - "\n", - "You can take a look at the underlying results." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "72c07e4d-3e3b-4838-81d4-98d2e7cfe8d7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from langsmith.client import Client\n", - "\n", - "client = Client()\n", - "projects = list(\n", - " client.list_projects(reference_dataset_name=\"Tool Usage - Typewriter (1 tool)\")\n", - ")\n", - "\n", - "dfs = []\n", - "for project in projects:\n", - " first_root_run = next(\n", - " client.list_runs(project_name=project.name, execution_order=1)\n", - " )\n", - " # Temporary way to get tag information\n", - " tags = first_root_run.tags\n", - " test_results = client.get_test_results(project_name=project.name)\n", - " test_results[\"model\"] = tags[0]\n", - " dfs.append(test_results)\n", - "\n", - "\n", - "df = pd.concat(dfs)\n", - "\n", - "df[\"actual_steps\"] = df[\"outputs.intermediate_steps\"].apply(\n", - " lambda steps: [step[0][\"tool\"] for step in steps]\n", - ")\n", - "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", - "df[\"num_actual_steps\"] = df[\"actual_steps\"].apply(len)" - ] - }, - { - "cell_type": "markdown", - "id": "e7b027a9-87ef-4a97-8b91-43eb82671c6c", - "metadata": {}, - "source": [ - "### Stats\n", - "\n", - "This is a simple task that involves using a single tool that takes only one argument (which character to type).\n", - "\n", - "Given the simplicity of the task, we expect that all models will be able to do well at this task (ideally at 100%)." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "b511c2af-9261-46fb-8f29-b8491b198e87", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feedback.Correct Final Statefeedback.Intermediate steps correctnessexecution_timefeedback.# steps / # expected steps# correctn
model
gpt-3.5-turbo-06130.950.9518.8803881.70000019.020
gpt-3.5-turbo-11060.900.7522.4718571.01245518.020
gpt-4-06130.900.9022.6637811.09375018.020
\n", - "
" - ], "text/plain": [ - " feedback.Correct Final State \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.95 \n", - "gpt-3.5-turbo-1106 0.90 \n", - "gpt-4-0613 0.90 \n", - "\n", - " feedback.Intermediate steps correctness execution_time \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.95 18.880388 \n", - "gpt-3.5-turbo-1106 0.75 22.471857 \n", - "gpt-4-0613 0.90 22.663781 \n", - "\n", - " feedback.# steps / # expected steps # correct n \n", - "model \n", - "gpt-3.5-turbo-0613 1.700000 19.0 20 \n", - "gpt-3.5-turbo-1106 1.012455 18.0 20 \n", - "gpt-4-0613 1.093750 18.0 20 " + "{'question': 'abc',\n", + " 'output': 'abc',\n", + " 'intermediate_steps': [(ToolAgentAction(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'function': {'arguments': '{\"letter\": \"a\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 1, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'function': {'arguments': '{\"letter\": \"b\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 2, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'function': {'arguments': '{\"letter\": \"c\"}', 'name': 'type_letter'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-7d6be045-b9e2-4f24-991c-8e34ccd53b98', tool_calls=[{'name': 'type_letter', 'args': {'letter': 'a'}, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80'}, {'name': 'type_letter', 'args': {'letter': 'b'}, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq'}, {'name': 'type_letter', 'args': {'letter': 'c'}, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj'}], tool_call_chunks=[{'name': 'type_letter', 'args': '{\"letter\": \"a\"}', 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'index': 0}, {'name': 'type_letter', 'args': '{\"letter\": \"b\"}', 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'index': 1}, {'name': 'type_letter', 'args': '{\"letter\": \"c\"}', 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'index': 2}])], tool_call_id='call_f4exPQMfz4VWxFJw4LhyMc80'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'function': {'arguments': '{\"letter\": \"a\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 1, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'function': {'arguments': '{\"letter\": \"b\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 2, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'function': {'arguments': '{\"letter\": \"c\"}', 'name': 'type_letter'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-7d6be045-b9e2-4f24-991c-8e34ccd53b98', tool_calls=[{'name': 'type_letter', 'args': {'letter': 'a'}, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80'}, {'name': 'type_letter', 'args': {'letter': 'b'}, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq'}, {'name': 'type_letter', 'args': {'letter': 'c'}, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj'}], tool_call_chunks=[{'name': 'type_letter', 'args': '{\"letter\": \"a\"}', 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'index': 0}, {'name': 'type_letter', 'args': '{\"letter\": \"b\"}', 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'index': 1}, {'name': 'type_letter', 'args': '{\"letter\": \"c\"}', 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'index': 2}])], tool_call_id='call_DHOJfLJEKuOKdzBa8ZLRYJZq'),\n", + " 'OK'),\n", + " 
(ToolAgentAction(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'function': {'arguments': '{\"letter\": \"a\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 1, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'function': {'arguments': '{\"letter\": \"b\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 2, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'function': {'arguments': '{\"letter\": \"c\"}', 'name': 'type_letter'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-7d6be045-b9e2-4f24-991c-8e34ccd53b98', tool_calls=[{'name': 'type_letter', 'args': {'letter': 'a'}, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80'}, {'name': 'type_letter', 'args': {'letter': 'b'}, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq'}, {'name': 'type_letter', 'args': {'letter': 'c'}, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj'}], tool_call_chunks=[{'name': 'type_letter', 'args': '{\"letter\": \"a\"}', 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'index': 0}, {'name': 'type_letter', 'args': '{\"letter\": \"b\"}', 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'index': 1}, {'name': 'type_letter', 'args': '{\"letter\": \"c\"}', 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'index': 2}])], tool_call_id='call_EziJvB6jtUEg3CmXSsQ7OWBj'),\n", + " 'OK')],\n", + " 'state': 'abc'}" ] }, - "execution_count": 20, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "correct_df = (\n", - " df.groupby(\"model\")[\"feedback.Correct Final State\"].sum().to_frame(\"# correct\")\n", - ")\n", - "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n", + "from langchain import globals\n", "\n", - "columns = [\n", - " \"feedback.Correct Final State\",\n", - " \"feedback.Intermediate steps correctness\",\n", - " \"execution_time\",\n", - " \"feedback.# steps / # expected steps\",\n", - "]\n", + "globals.set_verbose(True)\n", "\n", - "df.groupby(\"model\")[columns].mean().join(correct_df).join(count_df)" + "agent = agent_factory()\n", + "agent.invoke({\"question\": \"abc\"})" ] }, { "cell_type": "markdown", - "id": "9a311343-1d5a-433b-9ee3-685de301551d", + "id": "4729e72c-3903-478a-b298-4a586af33912", "metadata": {}, "source": [ - "### Individual" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "e0d57162-1626-4acc-88e1-91d4d4041234", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodeloutputs.statereference.statefeedback.Correct Final Statenum_expected_stepsnum_actual_steps
example_id
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-0613aaaaaaaaaaaaaaaa0.0115
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-1106aa1.011
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-4-0613abca0.013
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-0613aaaa1.022
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-1106aaaa1.022
\n", - "
" - ], - "text/plain": [ - " input.question model \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-0613 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-1106 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-4-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-1106 \n", - "\n", - " outputs.state reference.state \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca aaaaaaaaaaaaaaa a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca abc a \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "\n", - " feedback.Correct Final State \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "\n", - " num_expected_steps num_actual_steps \n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 15 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 1 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 3 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "columns = [\n", - " \"input.question\",\n", - " \"model\",\n", - " \"outputs.state\",\n", - " \"reference.state\",\n", - " \"feedback.Correct Final State\",\n", - " \"num_expected_steps\",\n", - " \"num_actual_steps\",\n", - "]\n", - "df[columns].sort_values(by=[\"input.question\", \"model\"]).head()" + "## Benchmarking\n", + "\n", + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." ] }, { "cell_type": "code", - "execution_count": 23, - "id": "7201d880-d338-40c4-a042-7d5c549cf77a", - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodeloutputs.statereference.statefeedback.Correct Final Statenum_expected_stepsnum_actual_steps
example_id
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-0613aaaaaaaaaaaaaaaa0.0115
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-1106aa1.011
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-4-0613abca0.013
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-0613aaaa1.022
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-1106aaaa1.022
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-4-0613aaaa1.022
288d6483-c618-4e34-9b86-275b490e0975aaagpt-3.5-turbo-0613aaaaaa1.033
288d6483-c618-4e34-9b86-275b490e0975aaagpt-3.5-turbo-1106aaaaaa1.033
288d6483-c618-4e34-9b86-275b490e0975aaagpt-4-0613aaaaaa1.033
915bd4b5-a536-4849-8cb6-8a658407c2c9aaaagpt-3.5-turbo-0613aaaaaaaa1.044
915bd4b5-a536-4849-8cb6-8a658407c2c9aaaagpt-3.5-turbo-1106aaaaaaaa1.044
915bd4b5-a536-4849-8cb6-8a658407c2c9aaaagpt-4-0613aaaaaaaa1.044
1cb7a14d-cc7d-44f1-ab47-394f8221abeecatgpt-3.5-turbo-0613catcat1.033
1cb7a14d-cc7d-44f1-ab47-394f8221abeecatgpt-3.5-turbo-1106catcat1.033
1cb7a14d-cc7d-44f1-ab47-394f8221abeecatgpt-4-0613catcat1.033
5b409366-ee6a-4bdb-b842-5e71d3407a05churchgpt-3.5-turbo-0613churchchurch1.066
5b409366-ee6a-4bdb-b842-5e71d3407a05churchgpt-3.5-turbo-1106churchchurch1.067
5b409366-ee6a-4bdb-b842-5e71d3407a05churchgpt-4-0613churchchurch1.066
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2acommunicationgpt-3.5-turbo-0613communicationcommunication1.01313
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2acommunicationgpt-3.5-turbo-1106communicationcommunication1.01313
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2acommunicationgpt-4-0613communicationcommunication1.01313
5cf28d08-a49f-4a69-8759-b1b774ef74b1computergpt-3.5-turbo-0613computercomputer1.088
5cf28d08-a49f-4a69-8759-b1b774ef74b1computergpt-3.5-turbo-1106computercomputer1.089
5cf28d08-a49f-4a69-8759-b1b774ef74b1computergpt-4-0613computercomputer1.088
9017ddcc-d3bd-45a8-88dd-70906964586bdictionarygpt-3.5-turbo-0613dictionarydictionary1.01010
9017ddcc-d3bd-45a8-88dd-70906964586bdictionarygpt-3.5-turbo-1106dictiondictionary0.0107
9017ddcc-d3bd-45a8-88dd-70906964586bdictionarygpt-4-0613dictionarydictionary1.01010
b1ac4715-a0ad-48f2-8741-949ca23b39ebdoggpt-3.5-turbo-0613dogdog1.033
b1ac4715-a0ad-48f2-8741-949ca23b39ebdoggpt-3.5-turbo-1106dogdog1.033
b1ac4715-a0ad-48f2-8741-949ca23b39ebdoggpt-4-0613dogdog1.033
10d42048-ac73-414f-9f50-dba79c3b74a7handgpt-3.5-turbo-0613handhand1.044
10d42048-ac73-414f-9f50-dba79c3b74a7handgpt-3.5-turbo-1106handhand1.044
10d42048-ac73-414f-9f50-dba79c3b74a7handgpt-4-0613handhand1.044
daf06d4f-9b1d-4f5a-8aa9-09f885a79adbheadgpt-3.5-turbo-0613headhead1.044
daf06d4f-9b1d-4f5a-8aa9-09f885a79adbheadgpt-3.5-turbo-1106headhead1.044
daf06d4f-9b1d-4f5a-8aa9-09f885a79adbheadgpt-4-0613headhead1.044
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3fhorsegpt-3.5-turbo-0613horsehorse1.055
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3fhorsegpt-3.5-turbo-1106horsehorse1.055
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3fhorsegpt-4-0613horsehorse1.055
5daad87c-a008-49ab-841c-76916b150f4dhousegpt-3.5-turbo-0613househouse1.055
5daad87c-a008-49ab-841c-76916b150f4dhousegpt-3.5-turbo-1106househouse1.055
5daad87c-a008-49ab-841c-76916b150f4dhousegpt-4-0613househouse1.055
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2informationgpt-3.5-turbo-0613informationinformation1.01111
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2informationgpt-3.5-turbo-1106information!information0.01112
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2informationgpt-4-0613informationinformation1.01111
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49keyboardgpt-3.5-turbo-0613keyboardkeyboard1.088
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49keyboardgpt-3.5-turbo-1106keyboardkeyboard1.088
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49keyboardgpt-4-0613eyboardkeyboard0.087
a4ffa874-b03b-40ed-b360-d17c963ef27eschoolgpt-3.5-turbo-0613schoolschool1.066
a4ffa874-b03b-40ed-b360-d17c963ef27eschoolgpt-3.5-turbo-1106schoolschool1.067
a4ffa874-b03b-40ed-b360-d17c963ef27eschoolgpt-4-0613schoolschool1.066
8a2b5450-dd16-4213-8b70-cb2583d6c7ebstudentgpt-3.5-turbo-0613studentstudent1.077
8a2b5450-dd16-4213-8b70-cb2583d6c7ebstudentgpt-3.5-turbo-1106studentstudent1.077
8a2b5450-dd16-4213-8b70-cb2583d6c7ebstudentgpt-4-0613studentstudent1.077
223f250b-9c33-4aed-adfd-791547b44d3dteachergpt-3.5-turbo-0613teacherteacher1.077
223f250b-9c33-4aed-adfd-791547b44d3dteachergpt-3.5-turbo-1106teacherteacher1.077
223f250b-9c33-4aed-adfd-791547b44d3dteachergpt-4-0613teacherteacher1.077
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4auniversitygpt-3.5-turbo-0613universityuniversity1.01010
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4auniversitygpt-3.5-turbo-1106universityuniversity1.01010
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4auniversitygpt-4-0613universityuniversity1.01010
\n", - "
" - ], - "text/plain": [ - " input.question model \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-0613 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-1106 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-4-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-1106 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-4-0613 \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa gpt-3.5-turbo-0613 \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa gpt-3.5-turbo-1106 \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa gpt-4-0613 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa gpt-3.5-turbo-0613 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa gpt-3.5-turbo-1106 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa gpt-4-0613 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat gpt-3.5-turbo-0613 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat gpt-3.5-turbo-1106 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat gpt-4-0613 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church gpt-3.5-turbo-0613 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church gpt-3.5-turbo-1106 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church gpt-4-0613 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication gpt-3.5-turbo-0613 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication gpt-3.5-turbo-1106 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication gpt-4-0613 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer gpt-3.5-turbo-0613 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer gpt-3.5-turbo-1106 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer gpt-4-0613 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary gpt-3.5-turbo-0613 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary gpt-3.5-turbo-1106 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary gpt-4-0613 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog gpt-3.5-turbo-0613 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog gpt-3.5-turbo-1106 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog gpt-4-0613 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand gpt-3.5-turbo-0613 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand gpt-3.5-turbo-1106 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand gpt-4-0613 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head gpt-3.5-turbo-0613 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head gpt-3.5-turbo-1106 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head gpt-4-0613 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse gpt-3.5-turbo-0613 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse gpt-3.5-turbo-1106 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse gpt-4-0613 \n", - "5daad87c-a008-49ab-841c-76916b150f4d house gpt-3.5-turbo-0613 \n", - "5daad87c-a008-49ab-841c-76916b150f4d house gpt-3.5-turbo-1106 \n", - "5daad87c-a008-49ab-841c-76916b150f4d house gpt-4-0613 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information gpt-3.5-turbo-0613 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information gpt-3.5-turbo-1106 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information gpt-4-0613 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard gpt-3.5-turbo-0613 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard gpt-3.5-turbo-1106 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard gpt-4-0613 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school gpt-3.5-turbo-0613 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school gpt-3.5-turbo-1106 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school gpt-4-0613 \n", - 
"8a2b5450-dd16-4213-8b70-cb2583d6c7eb student gpt-3.5-turbo-0613 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student gpt-3.5-turbo-1106 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student gpt-4-0613 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher gpt-3.5-turbo-0613 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher gpt-3.5-turbo-1106 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher gpt-4-0613 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university gpt-3.5-turbo-0613 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university gpt-3.5-turbo-1106 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university gpt-4-0613 \n", - "\n", - " outputs.state reference.state \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca aaaaaaaaaaaaaaa a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca abc a \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa aaa \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa aaa \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa aaa \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa aaaa \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa aaaa \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa aaaa \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat cat \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat cat \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat cat \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church church \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church church \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church church \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication communication \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication communication \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication communication \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer computer \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer computer \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer computer \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary dictionary \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b diction dictionary \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary dictionary \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog dog \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog dog \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog dog \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand hand \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand hand \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand hand \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head head \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head head \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head head \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse horse \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse horse \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse horse \n", - "5daad87c-a008-49ab-841c-76916b150f4d house house \n", - "5daad87c-a008-49ab-841c-76916b150f4d house house \n", - "5daad87c-a008-49ab-841c-76916b150f4d house house \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information information \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information! 
information \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information information \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard keyboard \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard keyboard \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 eyboard keyboard \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school school \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school school \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school school \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student student \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student student \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student student \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher teacher \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher teacher \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher teacher \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university university \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university university \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university university \n", - "\n", - " feedback.Correct Final State \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "288d6483-c618-4e34-9b86-275b490e0975 1.0 \n", - "288d6483-c618-4e34-9b86-275b490e0975 1.0 \n", - "288d6483-c618-4e34-9b86-275b490e0975 1.0 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 1.0 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 1.0 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 1.0 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 1.0 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 1.0 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 1.0 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 1.0 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 1.0 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 1.0 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 1.0 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 1.0 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 1.0 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 1.0 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 1.0 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 1.0 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 1.0 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 0.0 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 1.0 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 1.0 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 1.0 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 1.0 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 1.0 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 1.0 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 1.0 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 1.0 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 1.0 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 1.0 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 1.0 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 1.0 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 1.0 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 1.0 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 1.0 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 1.0 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 1.0 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 0.0 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 1.0 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 1.0 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 1.0 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 0.0 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 1.0 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 1.0 \n", - 
"a4ffa874-b03b-40ed-b360-d17c963ef27e 1.0 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 1.0 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 1.0 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 1.0 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 1.0 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 1.0 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 1.0 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 1.0 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 1.0 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 1.0 \n", - "\n", - " num_expected_steps num_actual_steps \n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 15 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 1 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 3 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "288d6483-c618-4e34-9b86-275b490e0975 3 3 \n", - "288d6483-c618-4e34-9b86-275b490e0975 3 3 \n", - "288d6483-c618-4e34-9b86-275b490e0975 3 3 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 4 4 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 4 4 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 4 4 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 3 3 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 3 3 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 3 3 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 6 6 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 6 7 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 6 6 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 13 13 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 13 13 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 13 13 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 8 8 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 8 9 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 8 8 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 10 10 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 10 7 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 10 10 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 3 3 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 3 3 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 3 3 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 4 4 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 4 4 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 4 4 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 4 4 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 4 4 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 4 4 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 5 5 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 5 5 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 5 5 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 5 5 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 5 5 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 5 5 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 11 11 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 11 12 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 11 11 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 8 8 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 8 8 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 8 7 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 6 6 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 6 7 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 6 6 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 7 7 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 7 7 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 7 7 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 7 7 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 7 7 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 7 7 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 10 10 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 10 10 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 10 10 " - ] - }, - "execution_count": 23, 
- "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[columns].sort_values(by=[\"input.question\", \"model\"])" - ] + "execution_count": null, + "id": "87055296-62e1-4fa9-8868-5c213f4ea2e6", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/source/notebooks/tool_usage/typewriter_26.ipynb b/docs/source/notebooks/tool_usage/typewriter_26.ipynb index cb90bf2..37f3ea8 100644 --- a/docs/source/notebooks/tool_usage/typewriter_26.ipynb +++ b/docs/source/notebooks/tool_usage/typewriter_26.ipynb @@ -71,7 +71,7 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, "execution_count": 2, @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "61535a75-24f6-4727-9549-f76c263e9153", "metadata": { "tags": [] @@ -118,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "f35a0a1d-5a1e-4de1-8d8c-c7c9a264a6c7", "metadata": { "tags": [] @@ -127,14 +127,14 @@ { "data": { "text/plain": [ - "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x7f6cd20e6520>),\n", - " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x7f6cd20e65c0>),\n", - " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x7f6cd20e6660>),\n", - " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x7f6cd20e6700>),\n", - " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=, func=.func at 0x7f6cd20e67a0>)]" + "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x75aa9defc180>),\n", + " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x75aa9defc220>),\n", + " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x75aa9defc2c0>),\n", + " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x75aa9defc360>),\n", + " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=, func=.func at 0x75aa9defc400>)]" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -145,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "5bea0190-39ec-4f30-9a00-90136bc6bf0b", "metadata": { "tags": [] @@ -157,7 +157,7 @@ "'OK'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "bf7444da-15a1-455a-b22e-639cbfff8432", "metadata": { "tags": [] @@ -180,7 +180,7 @@ "'OK'" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -191,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "d12bd710-5c01-4539-a4b9-afbf03164923", "metadata": { "tags": [] @@ -203,7 +203,7 @@ "'ad'" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -217,105 +217,110 @@ "id": "f1d62a13-3771-460f-b131-4443f669ca3d", "metadata": {}, "source": [ - "## Agent Factory\n", + "## Explore the task\n", "\n", "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n", "\n", - "We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own." + "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", "metadata": { "tags": [] }, + "outputs": [], + "source": [ + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", + "\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", + "\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", + "\n", + "agent_factory = StandardAgentFactory(task, model, prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `a` with `{}`\n", + "\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `b` with `{}`\n", + "\n", + "\n", + "\u001b[0m\u001b[33;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `c` with `{}`\n", + "\n", + "\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3mabcabcabc\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, { "data": { "text/plain": [ - "{'input': 'hello',\n", - " 'output': 'hello\\nhello',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='h', tool_input={}, log='\\nInvoking: `h` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'h', 'arguments': ''}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='e', tool_input={}, log='\\nInvoking: `e` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'e', 'arguments': ''}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='l', tool_input={}, log='\\nInvoking: `l` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'l', 'arguments': ''}})]),\n", + "{'question': 'abc',\n", + " 'output': 'abcabcabc',\n", + " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 
'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_OrpjShN5uNzw2Rsb1tWF6swI'),\n", " 'OK'),\n", - " (AgentActionMessageLog(tool='l', tool_input={}, log='\\nInvoking: `l` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'l', 'arguments': ''}})]),\n", + " (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_2XO5RNgt9FjGvTXztgD0tKqW'),\n", " 'OK'),\n", - " (AgentActionMessageLog(tool='o', tool_input={}, log='\\nInvoking: `o` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'o', 'arguments': ''}})]),\n", + " (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_MRAOAgbi8vT445clqC8OybMR'),\n", " 'OK')],\n", - " 'state': 'hello'}" + " 'state': 'abc'}" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from langchain_benchmarks.tool_usage import agents\n", + "from langchain import globals\n", "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", + "globals.set_verbose(True)\n", "\n", - "# Let's test that our agent works\n", "agent = agent_factory()\n", - "agent.invoke({\"question\": \"hello\"})" + "agent.invoke({\"question\": \"abc\"})" ] }, { "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", + "id": "89124d06-41f7-4432-9f2e-542c0d85e2e5", "metadata": {}, "source": [ - "## Eval\n", - 
"\n", - "Let's evaluate an agent now." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "# Clone the dataset\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "\n", - "client = Client()\n", - "\n", - "models = [\"gpt-3.5-turbo-16k\"]\n", + "## Benchmarking\n", "\n", - "for model in models:\n", - " print()\n", - " agent_factory = agents.OpenAIAgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " concurrency_level=1,\n", - " project_name=f\"typewriter-26-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." ] } ], diff --git a/langchain_benchmarks/tool_usage/agents/adapters.py b/langchain_benchmarks/tool_usage/agents/adapters.py index be060f8..b5ecbe4 100644 --- a/langchain_benchmarks/tool_usage/agents/adapters.py +++ b/langchain_benchmarks/tool_usage/agents/adapters.py @@ -41,27 +41,8 @@ def _read_state(*args: Any, **kwargs: Any) -> Any: else: return None - def _format_input(inputs: dict) -> dict: - """Make sure that the input is always called `input`.""" - - if "question" not in inputs: - raise ValueError( - "Expected 'question' to be in the inputs. Found only the following " - f"keys {sorted(inputs.keys())}." - ) - - inputs = inputs.copy() # Because 'question' is popped below - - if "input" not in inputs: - return {"input": inputs.pop("question"), **inputs} - return inputs - - runnable = ( - RunnableLambda(_format_input).with_config({"run_name": "Format Input"}) - | agent_executor - | RunnableLambda(_ensure_output_exists).with_config( - {"run_name": "Ensure Output"} - ) + runnable = agent_executor | RunnableLambda(_ensure_output_exists).with_config( + {"run_name": "Ensure Output"} ) if state_reader is not None: