diff --git a/docs/source/notebooks/tool_usage/intro.ipynb b/docs/source/notebooks/tool_usage/intro.ipynb index 06d89d9..b228b8b 100644 --- a/docs/source/notebooks/tool_usage/intro.ipynb +++ b/docs/source/notebooks/tool_usage/intro.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "3b9b82fc-b689-4a25-b718-99ecc2fc6867", "metadata": { "tags": [] @@ -136,19 +136,21 @@ "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", "\n", "Success is measured by the ability to answer the question correctly, and efficiently. \n", - "Multiverse Math ToolUsageTask594f9f60-30a0-49bf-b075-f44beabf546aAn environment that contains a few basic math operations, but with altered results.\n", + "Multiverse Math ToolUsageTask47ed57bc-e852-4f84-a23e-cce4793864e9An environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", - "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math. \n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\n", + "\n", + "This task is associated with 20 test examples. \n", "\n", "" ], "text/plain": [ - "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})])" + "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. 
\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})])" ] }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -169,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "7543739b-d212-4249-9b4a-fc406a58c9c7", "metadata": { "tags": [] @@ -198,10 +200,10 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -248,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "f201dbbe-7d92-4bc7-b4b5-ea8901dd2970", "metadata": { "tags": [] @@ -257,13 +259,13 @@ { "data": { "text/plain": [ - "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x1277c18a0>),\n", - " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x1277c13a0>),\n", - " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x1277c19e0>),\n", - " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x1277c1800>)]" + "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x7b3a9f62c9a0>),\n", + " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x7b3a9f62c5e0>),\n", + " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x7b3a9f62cae0>),\n", + " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x7b3a9f62cb80>)]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -275,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "b07957ee-ae52-47d4-a4ff-aa99d4d9bdaf", "metadata": { "tags": [] @@ -287,7 +289,7 @@ "'OK'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "40fbb9b6-00f6-4445-b480-00eed6b5b3aa", "metadata": { "tags": [] @@ -312,7 +314,7 @@ "'aac'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -326,134 +328,118 @@ "id": "8d39b9b3-d4da-49bc-b3db-8a4165b1db55", "metadata": {}, "source": [ - "## Creating an agent\n", + "## Create an Agent!\n", "\n", - "So now that you know how the test environment works, it's time to define an agent! \n", + "Now that you know how the test environment works, let's create an agent that we can test!\n", "\n", - "We will follow the example in the LangChain documentation to [define an OpenAI tool using agent](https://python.langchain.com/docs/modules/agents/). " + "Because an agent interacts with the environment via tools and can change the state of the environment during the course of an agent run, what we actually want is the ability to create a fresh agent and a fresh environment for each test run.\n", + "\n", + "We'll do this using a factory. A factory is just a fancy name in computer science for an object that can create other objects. In this case, we'll have an Agent Factory that we can call and it'll create a fresh agent for us on each call.\n", + "\n", + "We'll use the StandardAgentFactory which under the hood creates a standard LangChain [tool calling agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/). It can be used with any [Chat Model that support tool calling](https://python.langchain.com/docs/integrations/chat/)." 
] }, { "cell_type": "code", - "execution_count": 26, - "id": "8827186a-8ed3-43c7-956c-71342e0a7bf2", - "metadata": { - "tags": [] - }, + "execution_count": 7, + "id": "db65c253-7710-4c7b-b968-0662ec089030", + "metadata": {}, "outputs": [], "source": [ - "from langchain.agents.format_scratchpad.openai_tools import (\n", - "    format_to_openai_tool_messages,\n", - ")\n", - "from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser\n", - "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n", - "from langchain.tools.render import (\n", - "    format_tool_to_openai_function,\n", - "    format_tool_to_openai_tool,\n", - ")\n", - "from langchain_community.chat_models import ChatOpenAI\n", - "from langchain_core.runnables import RunnableParallel\n", - "\n", - "tools = task.create_environment().tools\n", - "formatted_tools = [format_tool_to_openai_tool(t) for t in tools]\n", - "llm = ChatOpenAI(model=\"gpt-3.5-turbo-1106\", temperature=0, model_kwargs={\"seed\": 42})\n", - "# Compose the llm call with the tools' JSON schemas\n", - "llm_with_tools = llm.bind(tools=formatted_tools)\n", - "format_inputs = RunnableParallel(\n", - "    {\n", - "        \"input\": lambda x: x[\"input\"],\n", - "        \"agent_scratchpad\": lambda x: format_to_openai_tool_messages(\n", - "            x[\"intermediate_steps\"]\n", - "        ),\n", - "    }\n", - ")\n", + "from langchain_anthropic.chat_models import ChatAnthropic\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", "\n", + "model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n", "prompt = ChatPromptTemplate.from_messages(\n", "    [\n", + "        (\"system\", \"{instructions}\"),  # Populated from task.instructions automatically\n", "        (\n", - "            \"system\",\n", - "            \"You are very powerful assistant, but bad at calculating lengths of words.\",\n", - "        ),\n", - "        (\"user\", \"{input}\"),\n", - "        MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n", + "            \"human\",\n", + "            \"{question}\",\n", + "        ),  # Each evaluation example is associated with a question\n", + "        (\"placeholder\", \"{agent_scratchpad}\"),  # Space for the agent to do work\n", "    ]\n", ")\n", - "agent_definition = (\n", - "    # Input to this pipeline is a dictionary with \"input\" and \"intermediate_steps\" keys\n", - "    format_inputs | prompt | llm_with_tools | OpenAIToolsAgentOutputParser()\n", - ")" + "\n", + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "markdown", - "id": "7614ab73-dc66-4f2e-9eeb-ff1711c113d0", + "id": "5c99a9bd-fa3e-4401-9062-77dbcff30d5c", "metadata": {}, "source": [ - "### Agent Factory\n", - "\n", - "As discussed above, each test environment tracks state. We want to create a new environment for each data point to avoid cross-contamination between rows in the dataset.\n", - "\n", - "We do this by defining an agent factory. Below, we integrate our agent into a `CustomRunnableAgentFactory`, which helps create the environment and agent executor for each data point." + "Here are the instructions for the task" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "629416b3-b5d6-45ad-9bda-4f0642a0eb13", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 9, + "id": "8e1f0a3d-fed6-41f7-8825-08787a57ad98", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. 
For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\"" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from langchain_benchmarks.tool_usage.agents import CustomRunnableAgentFactory\n", - "\n", - "agent_factory = CustomRunnableAgentFactory(task, agent=agent_definition)" + "task.instructions" ] }, { "cell_type": "markdown", - "id": "7f06cf25-6766-4ea5-a566-36af045bdcf4", + "id": "82c9de5d-185b-4776-9ee9-112a2db32139", "metadata": {}, "source": [ - "Let's check that the agent works" + "Let's test it out" ] }, { "cell_type": "code", - "execution_count": 28, - "id": "755f7920-831b-4595-8c6d-cca22c935198", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 10, + "id": "ce67d619-fa99-4c15-bc53-3fb08b40a201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `a` with `{}`\n", + "responded: [{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `b` with `{}`\n", + "responded: [{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n", + "\n", + "\u001b[0m\u001b[33;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `c` with `{}`\n", + "responded: [{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n", + "\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m[]\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + } + ], "source": [ "from langchain import globals\n", "\n", - "globals.set_verbose(True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2804eae-5b0b-4a38-9dff-363a4fe8f324", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "globals.set_verbose(True)\n", "agent = agent_factory()\n", - "agent.invoke({\"question\": \"abc\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "2aa68a11-d268-4868-a862-309801201989", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + "agent.invoke({\"question\": \"abc\"})\n", "globals.set_verbose(False)" ] }, @@ -485,12 +471,12 @@ "id": "5e9e5817-3b9d-4a1e-8ee8-692d39aa68ca", "metadata": {}, "source": [ - "This evaluator will be used below when we benchmark on all tasks!" + "Each task is associated with its own task specific evaluator!" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "c88bd6e1-f77e-4668-a143-096929e897ee", "metadata": { "tags": [] @@ -499,10 +485,10 @@ { "data": { "text/plain": [ - "RunEvalConfig(evaluators=[], custom_evaluators=[], reference_key=None, prediction_key=None, input_key=None, eval_llm=None)" + "RunEvalConfig(evaluators=[], custom_evaluators=[], batch_evaluators=None, reference_key=None, prediction_key=None, input_key=None, eval_llm=None)" ] }, - "execution_count": 13, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -522,14 +508,13 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "60466447-eb37-4204-a497-fe47e8d8dd70", - "metadata": { - "tags": [] - }, + "execution_count": 11, + "id": "0770b442-f96a-4670-a4f7-3093f24fb64b", + "metadata": {}, "outputs": [], "source": [ "import datetime\n", + "import uuid\n", "\n", "from langsmith.client import Client\n", "\n", @@ -539,110 +524,89 @@ " model_registry,\n", " registry,\n", ")\n", - "from langchain_benchmarks.rate_limiting import RateLimiter\n", - "from langchain_benchmarks.tool_usage.agents import (\n", - " AnthropicToolUserFactory,\n", - " CustomAgentFactory,\n", - " CustomRunnableAgentFactory,\n", - " OpenAIAgentFactory,\n", - " OpenAIAssistantFactory,\n", - ")" + "from langchain_benchmarks.rate_limiting import RateLimiter" ] }, { - "cell_type": "code", - "execution_count": 15, - "id": "c448d139-9923-4cf6-af49-cbf3dff46bdc", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "15cbded4-5ab5-4b9b-9e88-77b24d3b750c", + "metadata": {}, "source": [ - "import uuid\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:]" + "Create an experiment ID. we'll use it to tag our runs, which we can later use to retrieve run data from LangSmith." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "200df769-4dd9-453b-8500-219c1d5305f6", - "metadata": { - "tags": [] - }, + "execution_count": 12, + "id": "c23208e3-01d1-4e83-9e4a-59544828f6f5", + "metadata": {}, "outputs": [], "source": [ - "tests = [\n", - " # 2-tuple of (architecture, model name)\n", - " (\"openai_functions\", \"gpt-3.5-turbo-1106\"), # Requires OpenAI Creds\n", - " (\"openai_functions\", \"gpt-3.5-turbo-0613\"),\n", - " (\"openai_functions\", \"gpt-4-1106-preview\"),\n", - " (\"openai_functions\", \"gpt-4-0613\"),\n", - " (\"openai_functions\", \"mistral-7b-instruct-v0.1\"), # Requires AnyScale creds\n", - " # Requires Anthropic Creds and Setting up Anthropics Tool Usage package.\n", - " # (\n", - " # \"anthropic_tool_user\",\n", - " # \"claude-2.1\",\n", - " # ),\n", - "]" + "experiment_id = uuid.uuid4().hex[:]" + ] + }, + { + "cell_type": "markdown", + "id": "83050cfc-f50f-4c63-8257-07e7688a54c4", + "metadata": {}, + "source": [ + "Run evaluation against all tasks." ] }, { "cell_type": "code", "execution_count": null, - "id": "5ddf7355-7db9-4adc-bc1e-f04c3d0ec57d", - "metadata": { - "tags": [] - }, + "id": "b2a3463b-1c9f-494b-bcbd-1dc1760ebf19", + "metadata": {}, "outputs": [], "source": [ "client = Client() # Launch langsmith client for cloning datasets\n", "today = datetime.date.today().isoformat()\n", - "rate_limiter = RateLimiter(requests_per_second=2)\n", "\n", - "for task in registry:\n", + "# You can use an optional rate limiter to rate limit your requests!\n", + "rate_limiter = RateLimiter(requests_per_second=1)\n", + "\n", + "\n", + "# Set up 2-tuples of (model name, model instance)\n", + "# You can update this list with any model that supports tool calling.\n", + "# See list here: https://python.langchain.com/docs/integrations/chat/\n", + "tests = [\n", + " (\n", + " \"claude-3-haiku-20240307\",\n", + " ChatAnthropic(model=\"claude-3-haiku-20240307\", temperature=0),\n", + " )\n", + "]\n", + "\n", + "\n", + "for task in registry.tasks:\n", " if task.type != \"ToolUsageTask\":\n", " continue\n", "\n", - " dataset_name = task.name\n", + " dataset_name = task.name + f\" ({today})\"\n", " clone_public_dataset(task.dataset_id, dataset_name=dataset_name)\n", "\n", - " for arch, model in tests:\n", + " for model_name, model in tests:\n", " print()\n", - " print(f\"Benchmarking {task.name} with model: {model} and arch: {arch}\")\n", + " print(f\"Benchmarking {task.name} with model: {model_name}\")\n", " eval_config = task.get_eval_config()\n", "\n", - " if arch == \"openai_functions\":\n", - " agent_factory = OpenAIAgentFactory(\n", - " task, model=model, rate_limiter=rate_limiter\n", - " )\n", - " elif arch == \"custom_agent\":\n", - " agent_factory = CustomAgentFactory(\n", - " task, model=model, rate_limiter=rate_limiter\n", - " )\n", - " elif arch == \"custom_runnable_agent\":\n", - " # For this, the model would have to be a runnable object\n", - " agent_factory = CustomRunnableAgentFactory(task, agent=model)\n", - " elif arch == \"anthropic_tool_user\":\n", - " agent_factory = AnthropicToolUserFactory(task)\n", - " else:\n", - " raise ValueError()\n", + " agent_factory = StandardAgentFactory(\n", + " task, model, prompt, rate_limiter=rate_limiter\n", + " )\n", "\n", " client.run_on_dataset(\n", " dataset_name=dataset_name,\n", " llm_or_chain_factory=agent_factory,\n", " evaluation=eval_config,\n", " verbose=False,\n", - " project_name=f\"{model}-{task.name}-{today}-{experiment_uuid}\",\n", - " tags=[model],\n", + " 
project_name=f\"{model_name}-{task.name}-{today}-{experiment_id}\",\n", " concurrency_level=5,\n", " project_metadata={\n", - " \"model\": model,\n", + " \"model\": model_name,\n", " \"id\": experiment_uuid,\n", " \"task\": task.name,\n", " \"date\": today,\n", " \"langchain_benchmarks_version\": __version__,\n", - " \"arch\": arch,\n", " },\n", " )" ] @@ -656,6 +620,8 @@ "\n", "The following sections demonstrate slightly more \"advanced\" usage if you want to completely customize the agent runtime in a way that is compatible with our test runner.\n", "\n", + "We'll also apply an adapter to the agent which will will capture its inputs and outputs (e.g, add information the agent's environment at the end of the run) so that it we can evaluate it.\n", + "\n", "### Custom Agent Factory\n", "\n", "If you want even more configurability beyond what the `CustomRunnableAgentFactory` provides, you can create your owne `AgentFactory` using the following pattern.\n", @@ -666,33 +632,33 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "bca8ad69-9956-451c-b639-ea30c77d982f", - "metadata": { - "tags": [] - }, + "execution_count": 16, + "id": "69351864-2e97-43df-81ae-5067cbf5e471", + "metadata": {}, "outputs": [], "source": [ - "from langchain.agents import AgentType, initialize_agent\n", - "from langchain.chat_models import ChatOpenAI\n", + "from typing import Optional\n", + "\n", + "from langchain.agents import AgentExecutor, create_tool_calling_agent\n", + "from langchain_anthropic import ChatAnthropic\n", + "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "from langchain_benchmarks.schema import ExtractionTask\n", - "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "44839ebe-48ea-4d5b-87b4-2ad72acacb71", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class AgentFactory:\n", - " def __init__(self, task: ExtractionTask, model: str) -> None:\n", + "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter\n", + "\n", + "\n", + "class CustomAgentFactory:\n", + " def __init__(\n", + " self,\n", + " task: ExtractionTask,\n", + " *,\n", + " # It can be useful to add a rate-limiter\n", + " # which will limit ther number of requests per second\n", + " # when running evaluation.\n", + " rate_limiter: Optional[RateLimiter] = None,\n", + " ) -> None:\n", " self.task = task\n", - " self.model = model\n", + " self.rate_limiter = rate_limiter\n", "\n", " def __call__(self):\n", " # This factory creates a new environment for every agent run.\n", @@ -701,63 +667,127 @@ " # At the end of the run, the environment state will be read.\n", " env = task.create_environment() # Create a new environment for every agent run!\n", " tools = env.tools\n", - " llm = ChatOpenAI(temperature=0, model=self.model)\n", - " agent_executor = initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.OPENAI_FUNCTIONS,\n", - " return_intermediate_steps=True,\n", + " model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n", + " prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", self.task.instructions),\n", + " (\n", + " \"human\",\n", + " \"{question}\",\n", + " ), # Populated from task.instructions automatically\n", + " (\"placeholder\", \"{agent_scratchpad}\"),\n", + " ]\n", + " )\n", + "\n", + " # This is the standard tool calling agent implementation\n", + " # Feel free to replace it with any other implementation you 
want!\n", + " # https://python.langchain.com/docs/modules/agents/how_to/custom_agent/\n", + " agent = create_tool_calling_agent(model, env.tools, prompt)\n", + "\n", + " if self.rate_limiter:\n", + " agent = with_rate_limit(agent, self.rate_limiter)\n", + "\n", + " executor = AgentExecutor(\n", + " agent=agent,\n", + " tools=env.tools,\n", " handle_parsing_errors=True,\n", + " return_intermediate_steps=True,\n", " )\n", + "\n", " # Apply the adapters so that inputs and outputs match dataset schema\n", " # state_reader automatically adds the state of the environment at the end of the run.\n", - " return apply_agent_executor_adapter(agent_executor, state_reader=env.read_state)" + " return apply_agent_executor_adapter(executor, state_reader=env.read_state)" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "8b6108e4-c7cc-42e8-a23d-89c7b94fab6c", - "metadata": { - "tags": [] - }, + "execution_count": 17, + "id": "18a96a6f-812b-4b0e-83c5-d001bf50851e", + "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Name Tool Usage - Typewriter (26 tools)
Type ToolUsageTask
Dataset ID 128af05e-aa00-4e3b-a958-d166dd450581
Description Environment with 26 tools each tool represents a letter of the alphabet.\n", "\n", "The objective of this task is to evaluate the model's ability the use tools\n", "for a simple repetition task.\n", "\n", "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", "\n", "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", "\n", "This is a variation of the typer writer task, where 26 parameterless tools are\n", "given instead of a single tool that takes a letter as an argument.
" + ], "text/plain": [ - "{'input': 'xypxy',\n", - " 'output': 'I have typed \"xypxy\" as you requested.',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'p'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'p'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"p\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK')],\n", - " 'state': 'xypxy'}" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 24, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "agent_factory = AgentFactory(task, \"gpt-4\")\n", - "agent = agent_factory()\n", - "agent.invoke({\"question\": \"xypxy\"})" + "task" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9bdf9328-0103-48d3-8dfc-933423db9796", + "execution_count": 18, + "id": "a7bd4af3-c0f1-4308-abbf-330d7497b3e3", + "metadata": {}, + "outputs": [], + "source": [ + "custom_agent_factory = CustomAgentFactory(task)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c5b69b7c-4294-47d1-85d7-47d718945898", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "agent = custom_agent_factory()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1ac24ef5-d3ca-41aa-b888-7ebcd8a92ff4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'abc',\n", + " 'output': [],\n", + " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_016f6CZwwFmdz2h8KbdGRVjj'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01JvfeTpU3hEuS7PknFk5a8S'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01NbBCY5Fg62RsyAAUd4n2g1'),\n", + " 'OK')],\n", + " 'state': 'abc'}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent.invoke({\"question\": \"abc\"})" + ] } ], "metadata": { @@ -776,7 +806,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/source/notebooks/tool_usage/multiverse_math.ipynb b/docs/source/notebooks/tool_usage/multiverse_math.ipynb index 44eb1b4..00b8cb7 100644 --- a/docs/source/notebooks/tool_usage/multiverse_math.ipynb +++ b/docs/source/notebooks/tool_usage/multiverse_math.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", "metadata": { "tags": [] @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", "metadata": { "tags": [] @@ -65,20 +65,22 @@ "\n", "Name Multiverse Math \n", "Type ToolUsageTask \n", - "Dataset ID 594f9f60-30a0-49bf-b075-f44beabf546a\n", + "Dataset ID 47ed57bc-e852-4f84-a23e-cce4793864e9\n", "DescriptionAn environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", - "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math. \n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\n", + "\n", + "This task is associated with 20 test examples. \n", "\n", "" ], "text/plain": [ - "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})" + "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -108,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "e2439d0c-ccb9-4f5b-a127-548725025a98", "metadata": { "tags": [] @@ -117,14 +119,14 @@ { "data": { "text/plain": [ - "[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=, func=),\n", - " StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=, func=),\n", - " StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=, func=),\n", - " StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=, func=),\n", - " StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=, func=)]" + "[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=, func=),\n", + " StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=, func=),\n", + " StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=, func=),\n", + " StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=, func=),\n", + " StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=, func=)]" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -144,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "f5a100bd-6e19-498f-8a36-393b5c19bcb9", "metadata": { "tags": [] @@ -156,7 +158,7 @@ "8.8" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -175,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "31afb08b-17b8-4866-86c1-ee24e804415c", "metadata": { "tags": [] @@ -187,7 +189,7 @@ "'You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.'" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -210,1101 +212,86 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': 'how much is 3 + 5',\n", - " 'output': '9.2',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='add', tool_input={'a': 3, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 3, 'b': 5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"a\": 3,\\n \"b\": 5\\n}', 'name': 'add'}})]),\n", - " 9.2)]}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from langchain_benchmarks.tool_usage import agents\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-4-0613\")\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", "\n", - "# Let's test that our agent works\n", - "agent = agent_factory.create()\n", - "agent.invoke({\"question\": \"how much is 3 + 5\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", "\n", - "Let's evaluate an agent now" + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, + "execution_count": 7, + "id": "11e4fff5-e184-45e1-a472-c0a9f70e897a", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", - "View the evaluation results for project 'multiverse-math-gpt-3.5-turbo-1106-d680' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/0919eab8-dca7-4049-a6eb-4067b9862eba?eval=true\n", "\n", - "View all tests for Dataset Multiverse Math at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n", - "[------------------------------------------------->] 10/10\n", - "View the evaluation results for project 'multiverse-math-gpt-3.5-turbo-0613-d680' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/126d8555-c69c-4ab7-ae95-2ad9bc41989e?eval=true\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `add` with `{'a': 2, 'b': 5}`\n", + "\n", "\n", - "View all tests for Dataset Multiverse Math at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n", - "[------------------------------------------------->] 10/10\n", - "View the evaluation results for project 'multiverse-math-gpt-4-0613-d680' at:\n", - 
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/619eb2de-8920-41db-91a1-15ee407422de?eval=true\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m8.2\u001b[0m\u001b[32;1m\u001b[1;3mThe result of 2 + 5 in this alternate mathematical universe is 8.2.\u001b[0m\n", "\n", - "View all tests for Dataset Multiverse Math at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0\n", - "[------------------------------------------------->] 10/10" + "\u001b[1m> Finished chain.\u001b[0m\n" ] - } - ], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "\n", - "client = Client()\n", - "\n", - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-0613\"]\n", - "\n", - "for model in models:\n", - " print()\n", - " agent_factory = agents.OpenAIAgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " project_name=f\"multiverse-math-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", - "metadata": {}, - "source": [ - "## Analyze\n", - "\n", - "You can take a look at the underlying results." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "78c4cc84-43c2-4084-a63b-dc10a5c01856", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from langsmith.client import Client\n", - "\n", - "client = Client()\n", - "projects = list(client.list_projects(reference_dataset_name=\"Multiverse Math\"))\n", - "\n", - "dfs = []\n", - "for project in projects:\n", - " first_root_run = next(\n", - " client.list_runs(project_name=project.name, execution_order=1)\n", - " )\n", - " # Temporary way to get tag information\n", - " tags = first_root_run.tags\n", - " test_results = client.get_test_results(project_name=project.name)\n", - " test_results[\"model\"] = tags[0]\n", - " dfs.append(test_results)\n", - "\n", - "\n", - "df = pd.concat(dfs)\n", - "\n", - "df[\"actual_steps\"] = df[\"outputs.intermediate_steps\"].apply(\n", - " lambda steps: [step[0][\"tool\"] for step in steps]\n", - ")\n", - "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)" - ] - }, - { - "cell_type": "markdown", - "id": "0ab0792e-e04a-400e-9726-5c123836f710", - "metadata": {}, - "source": [ - "### Stats\n", - "\n", - "This is a really small dataset so it's hard to tell whether there are substantial differences between the models; however, the agents are clearly not perfect here.\n", - "\n", - "The results are suggestive of the fact that it's more difficult for gpt-4 to ignore what it knows about math (which isn't surprising); e.g., in this universe the negative of -5 is still -5 (rather than 5).\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "53d85491-a5a0-4448-bccc-7171e03ffb21", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feedback.correctnessfeedback.Intermediate steps correctnessexecution_timefeedback.# steps / # expected steps# correctn
model
gpt-3.5-turbo-06130.80.87.9929281.033338.010
gpt-3.5-turbo-11060.60.68.9331720.933326.010
gpt-4-06130.50.68.3295580.766665.010
\n", - "
" - ], "text/plain": [ - " feedback.correctness \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.8 \n", - "gpt-3.5-turbo-1106 0.6 \n", - "gpt-4-0613 0.5 \n", - "\n", - " feedback.Intermediate steps correctness execution_time \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.8 7.992928 \n", - "gpt-3.5-turbo-1106 0.6 8.933172 \n", - "gpt-4-0613 0.6 8.329558 \n", - "\n", - " feedback.# steps / # expected steps # correct n \n", - "model \n", - "gpt-3.5-turbo-0613 1.03333 8.0 10 \n", - "gpt-3.5-turbo-1106 0.93332 6.0 10 \n", - "gpt-4-0613 0.76666 5.0 10 " + "{'question': 'how much is 2+5',\n", + " 'output': 'The result of 2 + 5 in this alternate mathematical universe is 8.2.',\n", + " 'intermediate_steps': [(ToolAgentAction(tool='add', tool_input={'a': 2, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 2, 'b': 5}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_MZMnEZrae7AuXYtWzH0l9xKL', 'function': {'arguments': '{\"a\":2,\"b\":5}', 'name': 'add'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-b7548303-194d-40ee-85bf-3d43cac39526', tool_calls=[{'name': 'add', 'args': {'a': 2, 'b': 5}, 'id': 'call_MZMnEZrae7AuXYtWzH0l9xKL'}], tool_call_chunks=[{'name': 'add', 'args': '{\"a\":2,\"b\":5}', 'id': 'call_MZMnEZrae7AuXYtWzH0l9xKL', 'index': 0}])], tool_call_id='call_MZMnEZrae7AuXYtWzH0l9xKL'),\n", + " 8.2)]}" ] }, - "execution_count": 11, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "correct_df = df.groupby(\"model\")[\"feedback.correctness\"].sum().to_frame(\"# correct\")\n", - "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n", + "from langchain import globals\n", "\n", - "columns = [\n", - " \"feedback.correctness\",\n", - " \"feedback.Intermediate steps correctness\",\n", - " \"execution_time\",\n", - " \"feedback.# steps / # expected steps\",\n", - "]\n", + "globals.set_verbose(True)\n", "\n", - "df.groupby(\"model\")[columns].mean().join(correct_df).join(count_df)" + "agent = agent_factory()\n", + "agent.invoke({\"question\": \"how much is 2+5\"})" ] }, { "cell_type": "markdown", - "id": "c3ac1946-a7cb-4cd2-8de1-d61c46966d06", + "id": "b29a915c-1041-4108-a234-a877b6f59de4", "metadata": {}, "source": [ - "### Individual" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "1798f587-38a1-439e-8c1e-f9eeb3a23c8d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodelactual_stepsreference.expected_stepsoutputs.outputreference.referencefeedback.correctnessnum_expected_steps
example_id
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-0613[add][add]The sum of 2 and 3 in this alternate mathemati...6.201.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-1106[add][add]The result of adding 2 and 3 is 6.2.6.201.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-4-0613[add][add]6.26.201.01
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-0613[add, multiply][add, multiply]You ate a total of 32.34 fruits.32.341.02
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-1106[add][add, multiply]You ate 16.2 fruits.32.340.02
\n", - "
" - ], - "text/plain": [ - " input.question \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "\n", - " model actual_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-0613 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-1106 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-4-0613 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-0613 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-1106 [add] \n", - "\n", - " reference.expected_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "\n", - " outputs.output \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The sum of 2 and 3 in this alternate mathemati... \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The result of adding 2 and 3 is 6.2. \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate a total of 32.34 fruits. \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate 16.2 fruits. \n", - "\n", - " reference.reference \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n", - "\n", - " feedback.correctness num_expected_steps \n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 1.0 2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 0.0 2 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "columns = [\n", - " \"input.question\",\n", - " \"model\",\n", - " \"actual_steps\",\n", - " \"reference.expected_steps\",\n", - " \"outputs.output\",\n", - " \"reference.reference\",\n", - " \"feedback.correctness\",\n", - " \"num_expected_steps\",\n", - "]\n", - "df[columns].sort_values(by=[\"input.question\", \"model\"]).head()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "016fbe05-a993-492c-95db-69d3ba756495", - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodelactual_stepsreference.expected_stepsoutputs.outputreference.referencefeedback.correctnessnum_expected_steps
example_id
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-0613[add][add]The sum of 2 and 3 in this alternate mathemati...6.2000001.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-3.5-turbo-1106[add][add]The result of adding 2 and 3 is 6.2.6.2000001.01
20ea2f0e-b306-474a-8daa-f4386cc16599Add 2 and 3gpt-4-0613[add][add]6.26.2000001.01
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-0613[add, multiply][add, multiply]You ate a total of 32.34 fruits.32.3400001.02
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-3.5-turbo-1106[add][add, multiply]You ate 16.2 fruits.32.3400000.02
2d3e1665-7b3f-4013-b010-6af30ed62ab2I ate 1 apple and 2 oranges every day for 7 da...gpt-4-0613[add, multiply][add, multiply]32.3432.3400001.02
c857031a-6ab1-4b06-9638-3a8a4ba69f11Subtract 3 from 2gpt-3.5-turbo-0613[subtract][subtract]The result of subtracting 3 from 2 in this alt...-4.0000001.01
c857031a-6ab1-4b06-9638-3a8a4ba69f11Subtract 3 from 2gpt-3.5-turbo-1106[subtract][subtract]The result of subtracting 3 from 2 is -4.-4.0000001.01
c857031a-6ab1-4b06-9638-3a8a4ba69f11Subtract 3 from 2gpt-4-0613[subtract][subtract]-4.0-4.0000001.01
75db51d4-5c3b-4312-9eb9-b40c74eafdcdWhat is -5 if evaluated using the negate funct...gpt-3.5-turbo-0613[negate][negate]The result of evaluating -5 using the negate f...-5.0000001.01
75db51d4-5c3b-4312-9eb9-b40c74eafdcdWhat is -5 if evaluated using the negate funct...gpt-3.5-turbo-1106[negate][negate]The result of evaluating -5 using the negate f...-5.0000001.01
75db51d4-5c3b-4312-9eb9-b40c74eafdcdWhat is -5 if evaluated using the negate funct...gpt-4-0613[negate][negate]5-5.0000000.01
2a20a13d-050e-4a16-84ff-22d9582f1449after calculating the sin of 1.5 radians, divi...gpt-3.5-turbo-0613[sin, cos, divide][sin, cos, divide]The result of dividing the sine of 1.5 radians...0.0354571.03
2a20a13d-050e-4a16-84ff-22d9582f1449after calculating the sin of 1.5 radians, divi...gpt-3.5-turbo-1106[sin, cos, divide][sin, cos, divide]The result is 0.035457422151326225.0.0354571.03
2a20a13d-050e-4a16-84ff-22d9582f1449after calculating the sin of 1.5 radians, divi...gpt-4-0613[sin, cos, divide][sin, cos, divide]0.0354574221513262250.0354571.03
4ac33c1a-62f0-4da4-9455-07b582f6ff52calculate 101 to the power of 0.5 to 4 digits ...gpt-3.5-turbo-0613[power, power, power, power][power, round]The result of 101 to the power of 0.5 to 4 dig...102518.7812000.02
4ac33c1a-62f0-4da4-9455-07b582f6ff52calculate 101 to the power of 0.5 to 4 digits ...gpt-3.5-turbo-1106[power, power, power][power, round]3.8109e+37102518.7812000.02
4ac33c1a-62f0-4da4-9455-07b582f6ff52calculate 101 to the power of 0.5 to 4 digits ...gpt-4-0613[power][power, round]102519102518.7812000.02
2e82a924-8382-425e-8738-daa2d912e9feconvert 15 degrees to radiansgpt-3.5-turbo-0613[divide][pi, multiply, divide]15 degrees is approximately 0.0417 radians.0.1245880.03
2e82a924-8382-425e-8738-daa2d912e9feconvert 15 degrees to radiansgpt-3.5-turbo-1106[pi, divide][pi, multiply, divide]15 degrees is approximately 0.0417 radians.0.1245880.03
2e82a924-8382-425e-8738-daa2d912e9feconvert 15 degrees to radiansgpt-4-0613[multiply][pi, multiply, divide]0.287979450.1245880.03
67867526-791a-452f-b534-ef2c1f5efd20ecoli divides every 20 minutes. How many cells...gpt-3.5-turbo-0613[divide, power, multiply][divide, power, multiply]After 2 hours, starting with 5 cells, there wi...176.0000001.03
67867526-791a-452f-b534-ef2c1f5efd20ecoli divides every 20 minutes. How many cells...gpt-3.5-turbo-1106[divide, power][divide, power, multiply]After 2 hours, there will be 2187 cells.176.0000000.03
67867526-791a-452f-b534-ef2c1f5efd20ecoli divides every 20 minutes. How many cells...gpt-4-0613[multiply][divide, power, multiply]352.0176.0000000.03
27c44572-6c67-4129-a95a-fe1509c350bemultiply the result of (log of 100 to base 10)...gpt-3.5-turbo-0613[log, multiply][log, multiply]The result of multiplying the logarithm of 100...6.2223191.02
27c44572-6c67-4129-a95a-fe1509c350bemultiply the result of (log of 100 to base 10)...gpt-3.5-turbo-1106[log, multiply][log, multiply]The result is 6.2223186933233666.2223191.02
27c44572-6c67-4129-a95a-fe1509c350bemultiply the result of (log of 100 to base 10)...gpt-4-0613[multiply][log, multiply]19.86.2223190.02
dd079541-c0da-4d94-85b7-50f0516a9ca1what is the result of 2 to the power of 3?gpt-3.5-turbo-0613[power][power]The result of 2 to the power of 3 is 32.32.0000001.01
dd079541-c0da-4d94-85b7-50f0516a9ca1what is the result of 2 to the power of 3?gpt-3.5-turbo-1106[power][power]The result of 2 to the power of 3 is 32.32.0000001.01
dd079541-c0da-4d94-85b7-50f0516a9ca1what is the result of 2 to the power of 3?gpt-4-0613[power][power]32.032.0000001.01
\n", - "
" - ], - "text/plain": [ - " input.question \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 Subtract 3 from 2 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 Subtract 3 from 2 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 Subtract 3 from 2 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd What is -5 if evaluated using the negate funct... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd What is -5 if evaluated using the negate funct... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd What is -5 if evaluated using the negate funct... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", - "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", - "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", - "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", - "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", - "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", - "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", - "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", - "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", - "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 what is the result of 2 to the power of 3? \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 what is the result of 2 to the power of 3? \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 what is the result of 2 to the power of 3? 
\n", - "\n", - " model \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-0613 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-1106 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-4-0613 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-0613 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-1106 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-4-0613 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 gpt-3.5-turbo-0613 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 gpt-3.5-turbo-1106 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 gpt-4-0613 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd gpt-3.5-turbo-0613 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd gpt-3.5-turbo-1106 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd gpt-4-0613 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 gpt-3.5-turbo-0613 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 gpt-3.5-turbo-1106 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 gpt-4-0613 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 gpt-3.5-turbo-0613 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 gpt-3.5-turbo-1106 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 gpt-4-0613 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe gpt-3.5-turbo-0613 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe gpt-3.5-turbo-1106 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe gpt-4-0613 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 gpt-3.5-turbo-0613 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 gpt-3.5-turbo-1106 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 gpt-4-0613 \n", - "27c44572-6c67-4129-a95a-fe1509c350be gpt-3.5-turbo-0613 \n", - "27c44572-6c67-4129-a95a-fe1509c350be gpt-3.5-turbo-1106 \n", - "27c44572-6c67-4129-a95a-fe1509c350be gpt-4-0613 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 gpt-3.5-turbo-0613 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 gpt-3.5-turbo-1106 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 gpt-4-0613 \n", - "\n", - " actual_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, power, power, power] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, power, power] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [multiply] \n", - 
"dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "\n", - " reference.expected_steps \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 [subtract] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd [negate] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 [sin, cos, divide] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, round] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, round] \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [power, round] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, multiply, divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, multiply, divide] \n", - "2e82a924-8382-425e-8738-daa2d912e9fe [pi, multiply, divide] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "67867526-791a-452f-b534-ef2c1f5efd20 [divide, power, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "27c44572-6c67-4129-a95a-fe1509c350be [log, multiply] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 [power] \n", - "\n", - " outputs.output \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The sum of 2 and 3 in this alternate mathemati... \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 The result of adding 2 and 3 is 6.2. \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate a total of 32.34 fruits. \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate 16.2 fruits. \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 The result of subtracting 3 from 2 in this alt... \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 The result of subtracting 3 from 2 is -4. \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.0 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd The result of evaluating -5 using the negate f... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd The result of evaluating -5 using the negate f... \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 5 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 The result of dividing the sine of 1.5 radians... \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 The result is 0.035457422151326225. \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457422151326225 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 The result of 101 to the power of 0.5 to 4 dig... \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 3.8109e+37 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102519 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 15 degrees is approximately 0.0417 radians. 
\n", - "2e82a924-8382-425e-8738-daa2d912e9fe 15 degrees is approximately 0.0417 radians. \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.28797945 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 After 2 hours, starting with 5 cells, there wi... \n", - "67867526-791a-452f-b534-ef2c1f5efd20 After 2 hours, there will be 2187 cells. \n", - "67867526-791a-452f-b534-ef2c1f5efd20 352.0 \n", - "27c44572-6c67-4129-a95a-fe1509c350be The result of multiplying the logarithm of 100... \n", - "27c44572-6c67-4129-a95a-fe1509c350be The result is 6.222318693323366 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 19.8 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 The result of 2 to the power of 3 is 32. \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 The result of 2 to the power of 3 is 32. \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.0 \n", - "\n", - " reference.reference \\\n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.200000 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.200000 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 6.200000 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.340000 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.340000 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.340000 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.000000 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.000000 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 -4.000000 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd -5.000000 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd -5.000000 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd -5.000000 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 0.035457 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102518.781200 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102518.781200 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 102518.781200 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.124588 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.124588 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.124588 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 176.000000 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 176.000000 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 176.000000 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 6.222319 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 6.222319 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 6.222319 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.000000 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.000000 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 32.000000 \n", - "\n", - " feedback.correctness num_expected_steps \n", - "example_id \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 1.0 2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 0.0 2 \n", - "2d3e1665-7b3f-4013-b010-6af30ed62ab2 1.0 2 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 1.0 1 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 1.0 1 \n", - "c857031a-6ab1-4b06-9638-3a8a4ba69f11 1.0 1 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 1.0 1 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 1.0 1 \n", - "75db51d4-5c3b-4312-9eb9-b40c74eafdcd 0.0 1 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 1.0 3 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 1.0 3 \n", - "2a20a13d-050e-4a16-84ff-22d9582f1449 1.0 3 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 2 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 2 \n", - "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 2 \n", - 
"2e82a924-8382-425e-8738-daa2d912e9fe 0.0 3 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.0 3 \n", - "2e82a924-8382-425e-8738-daa2d912e9fe 0.0 3 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 1.0 3 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 0.0 3 \n", - "67867526-791a-452f-b534-ef2c1f5efd20 0.0 3 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 1.0 2 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 1.0 2 \n", - "27c44572-6c67-4129-a95a-fe1509c350be 0.0 2 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 1.0 1 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 1.0 1 \n", - "dd079541-c0da-4d94-85b7-50f0516a9ca1 1.0 1 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[columns].sort_values(by=[\"input.question\", \"model\"])" + "## Benchmarking\n", + "\n", + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." ] } ], diff --git a/docs/source/notebooks/tool_usage/relational_data.ipynb b/docs/source/notebooks/tool_usage/relational_data.ipynb index 1c781d7..5a4b253 100644 --- a/docs/source/notebooks/tool_usage/relational_data.ipynb +++ b/docs/source/notebooks/tool_usage/relational_data.ipynb @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "27b6b0fd-639d-43a7-a730-9acdc5b2f102", "metadata": { "tags": [] @@ -97,14 +97,14 @@ { "data": { "text/plain": [ - "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n Args:\\n user_id: The user's ID.\\n\\n Returns:\\n The user's name.\", args_schema=, handle_tool_error=True, func=.get_user_name at 0x7fbb0e864f40>),\n", - " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=, handle_tool_error=True, func=.list_user_ids at 0x7fbb0e864fe0>),\n", - " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n Args:\\n name: The name to search for.\\n\\n Returns:\\n The list of matching users.', args_schema=, handle_tool_error=True, func=.find_users_by_name at 0x7fbb0e865080>),\n", - " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=, handle_tool_error=True, func=.find_locations_by_name at 0x7fbb0e865120>),\n", - " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=, handle_tool_error=True, func=.find_foods_by_name at 0x7fbb0e8651c0>)]" + "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n Args:\\n user_id: The user's ID.\\n\\n Returns:\\n The user's name.\", args_schema=, handle_tool_error=True, func=.get_user_name at 0x78f30602fec0>),\n", + " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=, handle_tool_error=True, func=.list_user_ids at 0x78f30602fe20>),\n", + " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> 
List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n Args:\\n name: The name to search for.\\n\\n Returns:\\n The list of matching users.', args_schema=, handle_tool_error=True, func=.find_users_by_name at 0x78f306058040>),\n", + " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=, handle_tool_error=True, func=.find_locations_by_name at 0x78f3060580e0>),\n", + " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=, handle_tool_error=True, func=.find_foods_by_name at 0x78f306058180>)]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "7f1c1242-449c-4536-863d-b62bf6d2dff1", "metadata": { "tags": [] @@ -128,7 +128,7 @@ "'Bob'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "854e139b-a120-4012-bdf4-6394e0b1c42d", "metadata": { "tags": [] @@ -155,7 +155,7 @@ " {'id': 5, 'city': 'Miami'}]" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -169,105 +169,46 @@ "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", "metadata": {}, "source": [ - "## Agent Factory\n", + "## Explore the task\n", "\n", "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n", "\n", - "The `AgentExecutor` should accept `question` as an input and include the fields `output`, `intermediate_steps` and potentially `state` in its response -- for this we\n", - "will wrap the agent executor in an adapter (`apply_agent_executor_adapter`) that will help match the expected schema.\n", - "\n", - "Please reference the LangChain documentation to see how to [use and implement agents](https://python.langchain.com/docs/modules/agents/)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1c2d80d2-4ddf-4b80-b6c5-331133a85314", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.agents import AgentType, initialize_agent\n", - "from langchain.chat_models import ChatOpenAI\n", - "\n", - "from langchain_benchmarks.schema import ExtractionTask\n", - "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter" + "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one." 
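If you do want a custom factory, any callable that builds a fresh runnable per run will do: it should accept `question` and return `output` and `intermediate_steps`. The sketch below is one possible shape, loosely modeled on the older hand-rolled factory this diff removes; it assumes `create_tool_calling_agent` from `langchain.agents` and uses `apply_agent_executor_adapter` so the inputs and outputs match the dataset schema.

from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI

from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter


def custom_agent_factory():
    # Create a new environment for every run -- some environments carry mutable state.
    env = task.create_environment()
    llm = ChatOpenAI(temperature=0)
    prompt = ChatPromptTemplate.from_messages(
        [
            # StandardAgentFactory injects this via an "{instructions}" variable instead.
            ("system", task.instructions),
            ("human", "{question}"),
            ("placeholder", "{agent_scratchpad}"),
        ]
    )
    agent = create_tool_calling_agent(llm, env.tools, prompt)
    executor = AgentExecutor(
        agent=agent,
        tools=env.tools,
        return_intermediate_steps=True,
        handle_parsing_errors=True,
    )
    # Adapt to the dataset schema: accept "question", return "output" and
    # "intermediate_steps", and attach the final environment state via env.read_state.
    return apply_agent_executor_adapter(executor, state_reader=env.read_state)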
] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "81c0e4a1-f56e-4117-8804-4161c642b068", "metadata": { "tags": [] }, "outputs": [], "source": [ - "class AgentFactory:\n", - " def __init__(self, task: ExtractionTask, model: str) -> None:\n", - " self.task = task\n", - " self.model = model\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", "\n", - " def __call__(self):\n", - " # This factory creates a new environment for every agent run.\n", - " # The reason is that the environment may be associated with an environment state (e.g., typewriter)\n", - " # which is changed by the actions of the agent.\n", - " # At the end of the run, the environment state will be read.\n", - " env = task.create_environment() # Create a new environment for every agent run!\n", - " tools = env.tools\n", - " llm = ChatOpenAI(temperature=0, model=self.model)\n", - " agent_executor = initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.OPENAI_FUNCTIONS,\n", - " return_intermediate_steps=True,\n", - " handle_parsing_errors=True,\n", - " )\n", - " # Apply the adapters so that inputs and outputs match dataset schema\n", - " # state_reader automatically adds the state of the environment at the end of the run.\n", - " return apply_agent_executor_adapter(agent_executor, state_reader=env.read_state)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0ae8c6be-899c-44a6-a89b-0fc04c2cb05c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-32k-0613\"]\n", - "agent_factory = AgentFactory(task, models[0])" - ] - }, - { - "cell_type": "markdown", - "id": "87a64f76-65ae-4367-b43f-f2be3431e7af", - "metadata": {}, - "source": [ - "Let's test that our agent works" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "127a8aa5-839c-469c-a870-7b498f37c187", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain import globals\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", + "\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", "\n", - "globals.set_verbose(True)" + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "code", "execution_count": 11, - "id": "0e4896fa-3633-44a1-857f-80a263cf2e03", + "id": "382ff2f6-8099-415e-a58c-e659345f52fc", "metadata": { "tags": [] }, @@ -280,11 +221,11 @@ "\n", "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3m\n", - "Invoking: `find_locations_by_name` with `{'city': 'Los Angeles'}`\n", + "Invoking: `find_locations_by_name` with `{'city': 'LA'}`\n", "\n", "\n", - "\u001b[0m\u001b[36;1m\u001b[1;3m[{'id': 2, 'city': 'Los Angeles'}, {'id': 4, 'city': 'Houston'}, {'id': 1, 'city': 'New York'}, {'id': 3, 'city': 'Chicago'}, {'id': 5, 'city': 'Miami'}]\u001b[0m\u001b[32;1m\u001b[1;3m\n", - "Invoking: `get_weather_at_location` with `{'location_id': 2}`\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m[{'id': 2, 'city': 'Los Angeles'}, {'id': 1, 'city': 'New York'}, {'id': 3, 'city': 
'Chicago'}, {'id': 4, 'city': 'Houston'}, {'id': 5, 'city': 'Miami'}]\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `get_current_weather_for_location` with `{'location_id': 2}`\n", "\n", "\n", "\u001b[0m\u001b[36;1m\u001b[1;3mSunny, Temperature: 75°F\u001b[0m\u001b[32;1m\u001b[1;3mThe weather in Los Angeles is sunny with a temperature of 75°F.\u001b[0m\n", @@ -295,15 +236,15 @@ { "data": { "text/plain": [ - "{'input': 'whats the weather in LA?',\n", + "{'question': 'what is the weather in LA',\n", " 'output': 'The weather in Los Angeles is sunny with a temperature of 75°F.',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='find_locations_by_name', tool_input={'city': 'Los Angeles'}, log=\"\\nInvoking: `find_locations_by_name` with `{'city': 'Los Angeles'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\"city\":\"Los Angeles\"}', 'name': 'find_locations_by_name'}})]),\n", + " 'intermediate_steps': [(ToolAgentAction(tool='find_locations_by_name', tool_input={'city': 'LA'}, log=\"\\nInvoking: `find_locations_by_name` with `{'city': 'LA'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo', 'function': {'arguments': '{\"city\":\"LA\"}', 'name': 'find_locations_by_name'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-23ccffb0-3b17-46a4-b42e-5eaa3220b211', tool_calls=[{'name': 'find_locations_by_name', 'args': {'city': 'LA'}, 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo'}], tool_call_chunks=[{'name': 'find_locations_by_name', 'args': '{\"city\":\"LA\"}', 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo', 'index': 0}])], tool_call_id='call_hJrCZgP4eDgaj6s4RtCKXTOo'),\n", " [{'id': 2, 'city': 'Los Angeles'},\n", - " {'id': 4, 'city': 'Houston'},\n", " {'id': 1, 'city': 'New York'},\n", " {'id': 3, 'city': 'Chicago'},\n", + " {'id': 4, 'city': 'Houston'},\n", " {'id': 5, 'city': 'Miami'}]),\n", - " (AgentActionMessageLog(tool='get_weather_at_location', tool_input={'location_id': 2}, log=\"\\nInvoking: `get_weather_at_location` with `{'location_id': 2}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\"location_id\":2}', 'name': 'get_weather_at_location'}})]),\n", + " (ToolAgentAction(tool='get_current_weather_for_location', tool_input={'location_id': 2}, log=\"\\nInvoking: `get_current_weather_for_location` with `{'location_id': 2}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_lopYjo00MF9mZtnHtiisTqyp', 'function': {'arguments': '{\"location_id\":2}', 'name': 'get_current_weather_for_location'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9bba5827-d98b-464d-8028-25eb4a05d227', tool_calls=[{'name': 'get_current_weather_for_location', 'args': {'location_id': 2}, 'id': 'call_lopYjo00MF9mZtnHtiisTqyp'}], tool_call_chunks=[{'name': 'get_current_weather_for_location', 'args': '{\"location_id\":2}', 'id': 'call_lopYjo00MF9mZtnHtiisTqyp', 'index': 0}])], tool_call_id='call_lopYjo00MF9mZtnHtiisTqyp'),\n", " 'Sunny, Temperature: 75°F')]}" ] }, @@ -313,270 +254,31 @@ } ], "source": [ - "agent = agent_factory()\n", - "agent.invoke({\"question\": \"whats the weather in LA?\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "43edee23-109d-4f75-be68-d2b4b3240c9b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "globals.set_verbose(False)" - ] - }, - { - 
"cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", - "\n", - "Let's evaluate an agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "2e02fb65-eecf-43b8-bf76-1e86ca535da0", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "View the evaluation results for project 'tool-usage-relational-data-gpt-3.5-turbo-1106-8258' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/8aae8e36-720a-42c8-8540-5d5475e7181e?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Relational Data at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n", - "[------------------------------------------------->] 21/21\n", - "View the evaluation results for project 'tool-usage-relational-data-gpt-3.5-turbo-0613-8258' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/d8773df1-b054-41e4-a947-7b256ca8738b?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Relational Data at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n", - "[------------------------------------------------->] 21/21\n", - "View the evaluation results for project 'tool-usage-relational-data-gpt-4-0613-8258' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/090fecae-923f-4281-93f7-2c5253a2a2a4?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Relational Data at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n", - "[------------------------------------------------->] 21/21" - ] - } - ], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name) # Clone dataset\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "\n", - "client = Client()\n", + "from langchain import globals\n", "\n", - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-0613\"]\n", + "globals.set_verbose(True)\n", "\n", - "for model in models:\n", - " print()\n", - " agent_factory = AgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " project_name=f\"tool-usage-relational-data-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" + "agent = agent_factory()\n", + "agent.invoke({\"question\": \"what is the weather in LA\"})" ] }, { "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", + "id": "142ac640-3ce0-4f38-89cd-8d24d65997e4", "metadata": {}, "source": [ - "## Inspect\n", + "## Benchmarking\n", "\n", - "Here, we'll take a look at the underlying results a little bit." + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." 
] }, { "cell_type": "code", - "execution_count": 24, - "id": "fe9b20c4-9da0-47a2-95a3-b5660a54855a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from langsmith.client import Client\n", - "\n", - "client = Client()\n", - "projects = list(\n", - " client.list_projects(reference_dataset_name=\"Tool Usage - Relational Data\")\n", - ")\n", - "\n", - "dfs = []\n", - "for project in projects:\n", - " first_root_run = next(\n", - " client.list_runs(project_name=project.name, execution_order=1)\n", - " )\n", - " # Temporary way to get tag information\n", - " tags = first_root_run.tags\n", - " test_results = client.get_test_results(project_name=project.name)\n", - " test_results[\"model\"] = tags[0]\n", - " dfs.append(test_results)\n", - "\n", - "\n", - "df = pd.concat(dfs)" - ] - }, - { - "cell_type": "markdown", - "id": "da6962a1-81f2-445f-8547-513a105a3847", - "metadata": {}, - "source": [ - "### Stats" - ] - }, - { - "cell_type": "markdown", - "id": "4b7d366a-8754-417a-a654-956528f134e2", + "execution_count": null, + "id": "e49455cc-13c5-4ea6-bb4b-e61c39ea0267", "metadata": {}, - "source": [ - "In terms of function usage, gpt-4 uses more calls than is strictly necessary (`feedback.# steps / # expected steps` is > 1). However, it's doing a pretty good job.\n", - "\n", - "The gpt-3.5 models do not use tools enough (`feedback.# steps / # expected steps` is < 1) and as a result do a worse job at the task.\n", - "\n", - "Note: The intermediate step correctness happens to have the same average for the 3 models -- this is just a coincidence you can confirm by inspecting underlying results." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "066551f2-eb30-4bc1-94fd-0ca0085103ad", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feedback.correctnessfeedback.Intermediate steps correctnessexecution_timefeedback.# steps / # expected stepsn
model
gpt-3.5-turbo-06130.7142860.7142864.8295060.82539021
gpt-3.5-turbo-11060.8571430.7142865.4642180.96587121
gpt-4-06130.9523810.7142868.5443581.03730021
\n", - "
" - ], - "text/plain": [ - " feedback.correctness \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.714286 \n", - "gpt-3.5-turbo-1106 0.857143 \n", - "gpt-4-0613 0.952381 \n", - "\n", - " feedback.Intermediate steps correctness execution_time \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.714286 4.829506 \n", - "gpt-3.5-turbo-1106 0.714286 5.464218 \n", - "gpt-4-0613 0.714286 8.544358 \n", - "\n", - " feedback.# steps / # expected steps n \n", - "model \n", - "gpt-3.5-turbo-0613 0.825390 21 \n", - "gpt-3.5-turbo-1106 0.965871 21 \n", - "gpt-4-0613 1.037300 21 " - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n", - "df.groupby(\"model\")[\n", - " [\n", - " \"feedback.correctness\",\n", - " \"feedback.Intermediate steps correctness\",\n", - " \"execution_time\",\n", - " \"feedback.# steps / # expected steps\",\n", - " ]\n", - "].mean().join(count_df)" - ] + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/source/notebooks/tool_usage/typewriter_1.ipynb b/docs/source/notebooks/tool_usage/typewriter_1.ipynb index 9f1a2d2..93a21b3 100644 --- a/docs/source/notebooks/tool_usage/typewriter_1.ipynb +++ b/docs/source/notebooks/tool_usage/typewriter_1.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", "metadata": { "tags": [] @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", "metadata": { "tags": [] @@ -60,10 +60,10 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. 
\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 13, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -110,7 +110,7 @@ { "data": { "text/plain": [ - "[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=, func=.type_letter at 0x7f1791bd3f60>)]" + "[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=, func=.type_letter at 0x73a65909ee80>)]" ] }, "execution_count": 4, @@ -208,1404 +208,119 @@ "id": "cd13d120-1bf9-481c-9392-c15ebdd9d77f", "metadata": {}, "source": [ - "## Agent Factory\n", + "## Explore the task\n", "\n", "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n", "\n", - "We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own." + "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "id": "e2acab1e-78a7-4198-8e79-4529c95ce7e2", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from langchain_benchmarks.tool_usage import agents\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", + "\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", "\n", - "# Let's test that our agent works\n", - "agent = agent_factory()" + "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "id": "ceaa8edf-292b-48a1-be94-e6bfea0e75b1", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': 'abc',\n", - " 'output': 'a, b, c',\n", - " 'intermediate_steps': [(OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_uXlSgkG7N9nBCjYPB6SZn0n4', 'function': {'arguments': '{\\n \"letter\": \"a\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_uXlSgkG7N9nBCjYPB6SZn0n4'),\n", - " 'OK'),\n", - " (OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_dbSJb120AxFn55XcJHR0xH1I', 'function': {'arguments': '{\\n \"letter\": \"b\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_dbSJb120AxFn55XcJHR0xH1I'),\n", - " 'OK'),\n", - " (OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessage(content='', 
additional_kwargs={'tool_calls': [{'id': 'call_sFV4km9Jd9BOGO7A3oo1op0b', 'function': {'arguments': '{\\n \"letter\": \"c\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_sFV4km9Jd9BOGO7A3oo1op0b'),\n", - " 'OK')],\n", - " 'state': 'abc'}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agent.invoke({\"question\": \"abc\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval" - ] - }, - { - "cell_type": "markdown", - "id": "bc860fc6-89db-4929-926a-69b6320616ab", - "metadata": {}, - "source": [ - "Let's evaluate an agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", - "View the evaluation results for project 'typewriter-1-gpt-3.5-turbo-1106-7709' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/d29cf7d9-9cfa-4fcd-8380-8c339b940972?eval=true\n", "\n", - "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n", - "[------------------------------------------------->] 20/20\n", - "View the evaluation results for project 'typewriter-1-gpt-3.5-turbo-0613-7709' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/044be5ad-0871-4b08-bf5c-1dd6ba94f53b?eval=true\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `type_letter` with `{'letter': 'a'}`\n", + "\n", "\n", - "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n", - "[------------------------------------------------->] 20/20\n", - "View the evaluation results for project 'typewriter-1-gpt-4-0613-7709' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/b7ec2c5f-2a28-4bf7-828e-7a65ea5984be?eval=true\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `type_letter` with `{'letter': 'b'}`\n", "\n", - "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n", - "[------------------------------------------------->] 20/20" + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `type_letter` with `{'letter': 'c'}`\n", + "\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3mabc\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" ] - } - ], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n", - "\n", - "\n", - "client = Client()\n", - "\n", - "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-0613\"]\n", - "\n", - "for model in models:\n", - " # Will evaluate the trajectory and state, but not the output which is meaningless for this task.\n", - " print()\n", - " agent_factory = 
agents.OpenAIAgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " project_name=f\"typewriter-1-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", - "metadata": {}, - "source": [ - "## Inspect\n", - "\n", - "You can take a look at the underlying results." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "72c07e4d-3e3b-4838-81d4-98d2e7cfe8d7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from langsmith.client import Client\n", - "\n", - "client = Client()\n", - "projects = list(\n", - " client.list_projects(reference_dataset_name=\"Tool Usage - Typewriter (1 tool)\")\n", - ")\n", - "\n", - "dfs = []\n", - "for project in projects:\n", - " first_root_run = next(\n", - " client.list_runs(project_name=project.name, execution_order=1)\n", - " )\n", - " # Temporary way to get tag information\n", - " tags = first_root_run.tags\n", - " test_results = client.get_test_results(project_name=project.name)\n", - " test_results[\"model\"] = tags[0]\n", - " dfs.append(test_results)\n", - "\n", - "\n", - "df = pd.concat(dfs)\n", - "\n", - "df[\"actual_steps\"] = df[\"outputs.intermediate_steps\"].apply(\n", - " lambda steps: [step[0][\"tool\"] for step in steps]\n", - ")\n", - "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", - "df[\"num_actual_steps\"] = df[\"actual_steps\"].apply(len)" - ] - }, - { - "cell_type": "markdown", - "id": "e7b027a9-87ef-4a97-8b91-43eb82671c6c", - "metadata": {}, - "source": [ - "### Stats\n", - "\n", - "This is a simple task that involves using a single tool that takes only one argument (which character to type).\n", - "\n", - "Given the simplicity of the task, we expect that all models will be able to do well at this task (ideally at 100%)." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "b511c2af-9261-46fb-8f29-b8491b198e87", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feedback.Correct Final Statefeedback.Intermediate steps correctnessexecution_timefeedback.# steps / # expected steps# correctn
model
gpt-3.5-turbo-06130.950.9518.8803881.70000019.020
gpt-3.5-turbo-11060.900.7522.4718571.01245518.020
gpt-4-06130.900.9022.6637811.09375018.020
\n", - "
" - ], "text/plain": [ - " feedback.Correct Final State \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.95 \n", - "gpt-3.5-turbo-1106 0.90 \n", - "gpt-4-0613 0.90 \n", - "\n", - " feedback.Intermediate steps correctness execution_time \\\n", - "model \n", - "gpt-3.5-turbo-0613 0.95 18.880388 \n", - "gpt-3.5-turbo-1106 0.75 22.471857 \n", - "gpt-4-0613 0.90 22.663781 \n", - "\n", - " feedback.# steps / # expected steps # correct n \n", - "model \n", - "gpt-3.5-turbo-0613 1.700000 19.0 20 \n", - "gpt-3.5-turbo-1106 1.012455 18.0 20 \n", - "gpt-4-0613 1.093750 18.0 20 " + "{'question': 'abc',\n", + " 'output': 'abc',\n", + " 'intermediate_steps': [(ToolAgentAction(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'function': {'arguments': '{\"letter\": \"a\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 1, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'function': {'arguments': '{\"letter\": \"b\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 2, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'function': {'arguments': '{\"letter\": \"c\"}', 'name': 'type_letter'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-7d6be045-b9e2-4f24-991c-8e34ccd53b98', tool_calls=[{'name': 'type_letter', 'args': {'letter': 'a'}, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80'}, {'name': 'type_letter', 'args': {'letter': 'b'}, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq'}, {'name': 'type_letter', 'args': {'letter': 'c'}, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj'}], tool_call_chunks=[{'name': 'type_letter', 'args': '{\"letter\": \"a\"}', 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'index': 0}, {'name': 'type_letter', 'args': '{\"letter\": \"b\"}', 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'index': 1}, {'name': 'type_letter', 'args': '{\"letter\": \"c\"}', 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'index': 2}])], tool_call_id='call_f4exPQMfz4VWxFJw4LhyMc80'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'function': {'arguments': '{\"letter\": \"a\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 1, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'function': {'arguments': '{\"letter\": \"b\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 2, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'function': {'arguments': '{\"letter\": \"c\"}', 'name': 'type_letter'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-7d6be045-b9e2-4f24-991c-8e34ccd53b98', tool_calls=[{'name': 'type_letter', 'args': {'letter': 'a'}, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80'}, {'name': 'type_letter', 'args': {'letter': 'b'}, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq'}, {'name': 'type_letter', 'args': {'letter': 'c'}, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj'}], tool_call_chunks=[{'name': 'type_letter', 'args': '{\"letter\": \"a\"}', 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'index': 0}, {'name': 'type_letter', 'args': '{\"letter\": \"b\"}', 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'index': 1}, {'name': 'type_letter', 'args': '{\"letter\": \"c\"}', 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'index': 2}])], tool_call_id='call_DHOJfLJEKuOKdzBa8ZLRYJZq'),\n", + " 'OK'),\n", + " 
(ToolAgentAction(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'function': {'arguments': '{\"letter\": \"a\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 1, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'function': {'arguments': '{\"letter\": \"b\"}', 'name': 'type_letter'}, 'type': 'function'}, {'index': 2, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'function': {'arguments': '{\"letter\": \"c\"}', 'name': 'type_letter'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-7d6be045-b9e2-4f24-991c-8e34ccd53b98', tool_calls=[{'name': 'type_letter', 'args': {'letter': 'a'}, 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80'}, {'name': 'type_letter', 'args': {'letter': 'b'}, 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq'}, {'name': 'type_letter', 'args': {'letter': 'c'}, 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj'}], tool_call_chunks=[{'name': 'type_letter', 'args': '{\"letter\": \"a\"}', 'id': 'call_f4exPQMfz4VWxFJw4LhyMc80', 'index': 0}, {'name': 'type_letter', 'args': '{\"letter\": \"b\"}', 'id': 'call_DHOJfLJEKuOKdzBa8ZLRYJZq', 'index': 1}, {'name': 'type_letter', 'args': '{\"letter\": \"c\"}', 'id': 'call_EziJvB6jtUEg3CmXSsQ7OWBj', 'index': 2}])], tool_call_id='call_EziJvB6jtUEg3CmXSsQ7OWBj'),\n", + " 'OK')],\n", + " 'state': 'abc'}" ] }, - "execution_count": 20, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "correct_df = (\n", - " df.groupby(\"model\")[\"feedback.Correct Final State\"].sum().to_frame(\"# correct\")\n", - ")\n", - "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n", + "from langchain import globals\n", "\n", - "columns = [\n", - " \"feedback.Correct Final State\",\n", - " \"feedback.Intermediate steps correctness\",\n", - " \"execution_time\",\n", - " \"feedback.# steps / # expected steps\",\n", - "]\n", + "globals.set_verbose(True)\n", "\n", - "df.groupby(\"model\")[columns].mean().join(correct_df).join(count_df)" + "agent = agent_factory()\n", + "agent.invoke({\"question\": \"abc\"})" ] }, { "cell_type": "markdown", - "id": "9a311343-1d5a-433b-9ee3-685de301551d", + "id": "4729e72c-3903-478a-b298-4a586af33912", "metadata": {}, "source": [ - "### Individual" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "e0d57162-1626-4acc-88e1-91d4d4041234", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodeloutputs.statereference.statefeedback.Correct Final Statenum_expected_stepsnum_actual_steps
example_id
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-0613aaaaaaaaaaaaaaaa0.0115
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-1106aa1.011
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-4-0613abca0.013
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-0613aaaa1.022
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-1106aaaa1.022
\n", - "
" - ], - "text/plain": [ - " input.question model \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-0613 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-1106 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-4-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-1106 \n", - "\n", - " outputs.state reference.state \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca aaaaaaaaaaaaaaa a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca abc a \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "\n", - " feedback.Correct Final State \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "\n", - " num_expected_steps num_actual_steps \n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 15 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 1 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 3 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "columns = [\n", - " \"input.question\",\n", - " \"model\",\n", - " \"outputs.state\",\n", - " \"reference.state\",\n", - " \"feedback.Correct Final State\",\n", - " \"num_expected_steps\",\n", - " \"num_actual_steps\",\n", - "]\n", - "df[columns].sort_values(by=[\"input.question\", \"model\"]).head()" + "## Benchmarking\n", + "\n", + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." ] }, { "cell_type": "code", - "execution_count": 23, - "id": "7201d880-d338-40c4-a042-7d5c549cf77a", - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
input.questionmodeloutputs.statereference.statefeedback.Correct Final Statenum_expected_stepsnum_actual_steps
example_id
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-0613aaaaaaaaaaaaaaaa0.0115
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-3.5-turbo-1106aa1.011
89bb564a-ddee-4a36-8a3d-d093eef415caagpt-4-0613abca0.013
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-0613aaaa1.022
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-3.5-turbo-1106aaaa1.022
5b40cb96-ae09-438e-b940-d24445bb5d67aagpt-4-0613aaaa1.022
288d6483-c618-4e34-9b86-275b490e0975aaagpt-3.5-turbo-0613aaaaaa1.033
288d6483-c618-4e34-9b86-275b490e0975aaagpt-3.5-turbo-1106aaaaaa1.033
288d6483-c618-4e34-9b86-275b490e0975aaagpt-4-0613aaaaaa1.033
915bd4b5-a536-4849-8cb6-8a658407c2c9aaaagpt-3.5-turbo-0613aaaaaaaa1.044
915bd4b5-a536-4849-8cb6-8a658407c2c9aaaagpt-3.5-turbo-1106aaaaaaaa1.044
915bd4b5-a536-4849-8cb6-8a658407c2c9aaaagpt-4-0613aaaaaaaa1.044
1cb7a14d-cc7d-44f1-ab47-394f8221abeecatgpt-3.5-turbo-0613catcat1.033
1cb7a14d-cc7d-44f1-ab47-394f8221abeecatgpt-3.5-turbo-1106catcat1.033
1cb7a14d-cc7d-44f1-ab47-394f8221abeecatgpt-4-0613catcat1.033
5b409366-ee6a-4bdb-b842-5e71d3407a05churchgpt-3.5-turbo-0613churchchurch1.066
5b409366-ee6a-4bdb-b842-5e71d3407a05churchgpt-3.5-turbo-1106churchchurch1.067
5b409366-ee6a-4bdb-b842-5e71d3407a05churchgpt-4-0613churchchurch1.066
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2acommunicationgpt-3.5-turbo-0613communicationcommunication1.01313
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2acommunicationgpt-3.5-turbo-1106communicationcommunication1.01313
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2acommunicationgpt-4-0613communicationcommunication1.01313
5cf28d08-a49f-4a69-8759-b1b774ef74b1computergpt-3.5-turbo-0613computercomputer1.088
5cf28d08-a49f-4a69-8759-b1b774ef74b1computergpt-3.5-turbo-1106computercomputer1.089
5cf28d08-a49f-4a69-8759-b1b774ef74b1computergpt-4-0613computercomputer1.088
9017ddcc-d3bd-45a8-88dd-70906964586bdictionarygpt-3.5-turbo-0613dictionarydictionary1.01010
9017ddcc-d3bd-45a8-88dd-70906964586bdictionarygpt-3.5-turbo-1106dictiondictionary0.0107
9017ddcc-d3bd-45a8-88dd-70906964586bdictionarygpt-4-0613dictionarydictionary1.01010
b1ac4715-a0ad-48f2-8741-949ca23b39ebdoggpt-3.5-turbo-0613dogdog1.033
b1ac4715-a0ad-48f2-8741-949ca23b39ebdoggpt-3.5-turbo-1106dogdog1.033
b1ac4715-a0ad-48f2-8741-949ca23b39ebdoggpt-4-0613dogdog1.033
10d42048-ac73-414f-9f50-dba79c3b74a7handgpt-3.5-turbo-0613handhand1.044
10d42048-ac73-414f-9f50-dba79c3b74a7handgpt-3.5-turbo-1106handhand1.044
10d42048-ac73-414f-9f50-dba79c3b74a7handgpt-4-0613handhand1.044
daf06d4f-9b1d-4f5a-8aa9-09f885a79adbheadgpt-3.5-turbo-0613headhead1.044
daf06d4f-9b1d-4f5a-8aa9-09f885a79adbheadgpt-3.5-turbo-1106headhead1.044
daf06d4f-9b1d-4f5a-8aa9-09f885a79adbheadgpt-4-0613headhead1.044
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3fhorsegpt-3.5-turbo-0613horsehorse1.055
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3fhorsegpt-3.5-turbo-1106horsehorse1.055
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3fhorsegpt-4-0613horsehorse1.055
5daad87c-a008-49ab-841c-76916b150f4dhousegpt-3.5-turbo-0613househouse1.055
5daad87c-a008-49ab-841c-76916b150f4dhousegpt-3.5-turbo-1106househouse1.055
5daad87c-a008-49ab-841c-76916b150f4dhousegpt-4-0613househouse1.055
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2informationgpt-3.5-turbo-0613informationinformation1.01111
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2informationgpt-3.5-turbo-1106information!information0.01112
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2informationgpt-4-0613informationinformation1.01111
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49keyboardgpt-3.5-turbo-0613keyboardkeyboard1.088
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49keyboardgpt-3.5-turbo-1106keyboardkeyboard1.088
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49keyboardgpt-4-0613eyboardkeyboard0.087
a4ffa874-b03b-40ed-b360-d17c963ef27eschoolgpt-3.5-turbo-0613schoolschool1.066
a4ffa874-b03b-40ed-b360-d17c963ef27eschoolgpt-3.5-turbo-1106schoolschool1.067
a4ffa874-b03b-40ed-b360-d17c963ef27eschoolgpt-4-0613schoolschool1.066
8a2b5450-dd16-4213-8b70-cb2583d6c7ebstudentgpt-3.5-turbo-0613studentstudent1.077
8a2b5450-dd16-4213-8b70-cb2583d6c7ebstudentgpt-3.5-turbo-1106studentstudent1.077
8a2b5450-dd16-4213-8b70-cb2583d6c7ebstudentgpt-4-0613studentstudent1.077
223f250b-9c33-4aed-adfd-791547b44d3dteachergpt-3.5-turbo-0613teacherteacher1.077
223f250b-9c33-4aed-adfd-791547b44d3dteachergpt-3.5-turbo-1106teacherteacher1.077
223f250b-9c33-4aed-adfd-791547b44d3dteachergpt-4-0613teacherteacher1.077
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4auniversitygpt-3.5-turbo-0613universityuniversity1.01010
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4auniversitygpt-3.5-turbo-1106universityuniversity1.01010
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4auniversitygpt-4-0613universityuniversity1.01010
\n", - "
" - ], - "text/plain": [ - " input.question model \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-0613 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-3.5-turbo-1106 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a gpt-4-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-0613 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-3.5-turbo-1106 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa gpt-4-0613 \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa gpt-3.5-turbo-0613 \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa gpt-3.5-turbo-1106 \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa gpt-4-0613 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa gpt-3.5-turbo-0613 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa gpt-3.5-turbo-1106 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa gpt-4-0613 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat gpt-3.5-turbo-0613 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat gpt-3.5-turbo-1106 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat gpt-4-0613 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church gpt-3.5-turbo-0613 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church gpt-3.5-turbo-1106 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church gpt-4-0613 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication gpt-3.5-turbo-0613 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication gpt-3.5-turbo-1106 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication gpt-4-0613 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer gpt-3.5-turbo-0613 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer gpt-3.5-turbo-1106 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer gpt-4-0613 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary gpt-3.5-turbo-0613 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary gpt-3.5-turbo-1106 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary gpt-4-0613 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog gpt-3.5-turbo-0613 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog gpt-3.5-turbo-1106 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog gpt-4-0613 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand gpt-3.5-turbo-0613 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand gpt-3.5-turbo-1106 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand gpt-4-0613 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head gpt-3.5-turbo-0613 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head gpt-3.5-turbo-1106 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head gpt-4-0613 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse gpt-3.5-turbo-0613 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse gpt-3.5-turbo-1106 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse gpt-4-0613 \n", - "5daad87c-a008-49ab-841c-76916b150f4d house gpt-3.5-turbo-0613 \n", - "5daad87c-a008-49ab-841c-76916b150f4d house gpt-3.5-turbo-1106 \n", - "5daad87c-a008-49ab-841c-76916b150f4d house gpt-4-0613 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information gpt-3.5-turbo-0613 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information gpt-3.5-turbo-1106 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information gpt-4-0613 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard gpt-3.5-turbo-0613 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard gpt-3.5-turbo-1106 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard gpt-4-0613 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school gpt-3.5-turbo-0613 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school gpt-3.5-turbo-1106 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school gpt-4-0613 \n", - 
"8a2b5450-dd16-4213-8b70-cb2583d6c7eb student gpt-3.5-turbo-0613 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student gpt-3.5-turbo-1106 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student gpt-4-0613 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher gpt-3.5-turbo-0613 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher gpt-3.5-turbo-1106 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher gpt-4-0613 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university gpt-3.5-turbo-0613 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university gpt-3.5-turbo-1106 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university gpt-4-0613 \n", - "\n", - " outputs.state reference.state \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca aaaaaaaaaaaaaaa a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca a a \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca abc a \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 aa aa \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa aaa \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa aaa \n", - "288d6483-c618-4e34-9b86-275b490e0975 aaa aaa \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa aaaa \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa aaaa \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 aaaa aaaa \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat cat \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat cat \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee cat cat \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church church \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church church \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 church church \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication communication \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication communication \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a communication communication \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer computer \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer computer \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 computer computer \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary dictionary \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b diction dictionary \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b dictionary dictionary \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog dog \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog dog \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb dog dog \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand hand \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand hand \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 hand hand \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head head \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head head \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb head head \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse horse \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse horse \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f horse horse \n", - "5daad87c-a008-49ab-841c-76916b150f4d house house \n", - "5daad87c-a008-49ab-841c-76916b150f4d house house \n", - "5daad87c-a008-49ab-841c-76916b150f4d house house \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information information \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information! 
information \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 information information \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard keyboard \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 keyboard keyboard \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 eyboard keyboard \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school school \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school school \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e school school \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student student \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student student \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb student student \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher teacher \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher teacher \n", - "223f250b-9c33-4aed-adfd-791547b44d3d teacher teacher \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university university \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university university \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a university university \n", - "\n", - " feedback.Correct Final State \\\n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1.0 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 0.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 1.0 \n", - "288d6483-c618-4e34-9b86-275b490e0975 1.0 \n", - "288d6483-c618-4e34-9b86-275b490e0975 1.0 \n", - "288d6483-c618-4e34-9b86-275b490e0975 1.0 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 1.0 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 1.0 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 1.0 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 1.0 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 1.0 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 1.0 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 1.0 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 1.0 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 1.0 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 1.0 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 1.0 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 1.0 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 1.0 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 1.0 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 1.0 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 1.0 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 0.0 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 1.0 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 1.0 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 1.0 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 1.0 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 1.0 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 1.0 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 1.0 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 1.0 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 1.0 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 1.0 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 1.0 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 1.0 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 1.0 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 1.0 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 1.0 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 1.0 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 1.0 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 0.0 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 1.0 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 1.0 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 1.0 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 0.0 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 1.0 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 1.0 \n", - 
"a4ffa874-b03b-40ed-b360-d17c963ef27e 1.0 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 1.0 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 1.0 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 1.0 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 1.0 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 1.0 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 1.0 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 1.0 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 1.0 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 1.0 \n", - "\n", - " num_expected_steps num_actual_steps \n", - "example_id \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 15 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 1 \n", - "89bb564a-ddee-4a36-8a3d-d093eef415ca 1 3 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "5b40cb96-ae09-438e-b940-d24445bb5d67 2 2 \n", - "288d6483-c618-4e34-9b86-275b490e0975 3 3 \n", - "288d6483-c618-4e34-9b86-275b490e0975 3 3 \n", - "288d6483-c618-4e34-9b86-275b490e0975 3 3 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 4 4 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 4 4 \n", - "915bd4b5-a536-4849-8cb6-8a658407c2c9 4 4 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 3 3 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 3 3 \n", - "1cb7a14d-cc7d-44f1-ab47-394f8221abee 3 3 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 6 6 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 6 7 \n", - "5b409366-ee6a-4bdb-b842-5e71d3407a05 6 6 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 13 13 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 13 13 \n", - "c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a 13 13 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 8 8 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 8 9 \n", - "5cf28d08-a49f-4a69-8759-b1b774ef74b1 8 8 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 10 10 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 10 7 \n", - "9017ddcc-d3bd-45a8-88dd-70906964586b 10 10 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 3 3 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 3 3 \n", - "b1ac4715-a0ad-48f2-8741-949ca23b39eb 3 3 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 4 4 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 4 4 \n", - "10d42048-ac73-414f-9f50-dba79c3b74a7 4 4 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 4 4 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 4 4 \n", - "daf06d4f-9b1d-4f5a-8aa9-09f885a79adb 4 4 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 5 5 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 5 5 \n", - "fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f 5 5 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 5 5 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 5 5 \n", - "5daad87c-a008-49ab-841c-76916b150f4d 5 5 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 11 11 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 11 12 \n", - "57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 11 11 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 8 8 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 8 8 \n", - "5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 8 7 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 6 6 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 6 7 \n", - "a4ffa874-b03b-40ed-b360-d17c963ef27e 6 6 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 7 7 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 7 7 \n", - "8a2b5450-dd16-4213-8b70-cb2583d6c7eb 7 7 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 7 7 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 7 7 \n", - "223f250b-9c33-4aed-adfd-791547b44d3d 7 7 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 10 10 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 10 10 \n", - "08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a 10 10 " - ] - }, - "execution_count": 23, 
- "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[columns].sort_values(by=[\"input.question\", \"model\"])" - ] + "execution_count": null, + "id": "87055296-62e1-4fa9-8868-5c213f4ea2e6", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/source/notebooks/tool_usage/typewriter_26.ipynb b/docs/source/notebooks/tool_usage/typewriter_26.ipynb index cb90bf2..37f3ea8 100644 --- a/docs/source/notebooks/tool_usage/typewriter_26.ipynb +++ b/docs/source/notebooks/tool_usage/typewriter_26.ipynb @@ -71,7 +71,7 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, "execution_count": 2, @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "61535a75-24f6-4727-9549-f76c263e9153", "metadata": { "tags": [] @@ -118,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "f35a0a1d-5a1e-4de1-8d8c-c7c9a264a6c7", "metadata": { "tags": [] @@ -127,14 +127,14 @@ { "data": { "text/plain": [ - "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x7f6cd20e6520>),\n", - " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x7f6cd20e65c0>),\n", - " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x7f6cd20e6660>),\n", - " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x7f6cd20e6700>),\n", - " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=, func=.func at 0x7f6cd20e67a0>)]" + "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x75aa9defc180>),\n", + " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x75aa9defc220>),\n", + " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x75aa9defc2c0>),\n", + " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x75aa9defc360>),\n", + " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=, func=.func at 0x75aa9defc400>)]" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -145,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "5bea0190-39ec-4f30-9a00-90136bc6bf0b", "metadata": { "tags": [] @@ -157,7 +157,7 @@ "'OK'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "bf7444da-15a1-455a-b22e-639cbfff8432", "metadata": { "tags": [] @@ -180,7 +180,7 @@ "'OK'" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -191,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "d12bd710-5c01-4539-a4b9-afbf03164923", "metadata": { "tags": [] @@ -203,7 +203,7 @@ "'ad'" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -217,105 +217,110 @@ "id": "f1d62a13-3771-460f-b131-4443f669ca3d", "metadata": {}, "source": [ - "## Agent Factory\n", + "## Explore the task\n", "\n", "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n", "\n", - "We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own." + "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", "metadata": { "tags": [] }, + "outputs": [], + "source": [ + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai.chat_models import ChatOpenAI\n", + "\n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", + "\n", + "model = ChatOpenAI(temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"), # Populated from the test data\n", + " (\n", + " \"placeholder\",\n", + " \"{agent_scratchpad}\",\n", + " ), # Work where the agent can do its work (e.g., call multiple tools)\n", + " ]\n", + ")\n", + "\n", + "agent_factory = StandardAgentFactory(task, model, prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `a` with `{}`\n", + "\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `b` with `{}`\n", + "\n", + "\n", + "\u001b[0m\u001b[33;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `c` with `{}`\n", + "\n", + "\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3mabcabcabc\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, { "data": { "text/plain": [ - "{'input': 'hello',\n", - " 'output': 'hello\\nhello',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='h', tool_input={}, log='\\nInvoking: `h` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'h', 'arguments': ''}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='e', tool_input={}, log='\\nInvoking: `e` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'e', 'arguments': ''}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='l', tool_input={}, log='\\nInvoking: `l` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'l', 'arguments': ''}})]),\n", + "{'question': 'abc',\n", + " 'output': 'abcabcabc',\n", + " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 
'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_OrpjShN5uNzw2Rsb1tWF6swI'),\n", " 'OK'),\n", - " (AgentActionMessageLog(tool='l', tool_input={}, log='\\nInvoking: `l` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'l', 'arguments': ''}})]),\n", + " (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_2XO5RNgt9FjGvTXztgD0tKqW'),\n", " 'OK'),\n", - " (AgentActionMessageLog(tool='o', tool_input={}, log='\\nInvoking: `o` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'o', 'arguments': ''}})]),\n", + " (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_MRAOAgbi8vT445clqC8OybMR'),\n", " 'OK')],\n", - " 'state': 'hello'}" + " 'state': 'abc'}" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from langchain_benchmarks.tool_usage import agents\n", + "from langchain import globals\n", "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", + "globals.set_verbose(True)\n", "\n", - "# Let's test that our agent works\n", "agent = agent_factory()\n", - "agent.invoke({\"question\": \"hello\"})" + "agent.invoke({\"question\": \"abc\"})" ] }, { "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", + "id": "89124d06-41f7-4432-9f2e-542c0d85e2e5", "metadata": {}, "source": [ - "## Eval\n", - 
"\n", - "Let's evaluate an agent now." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import uuid\n", - "\n", - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks import clone_public_dataset\n", - "\n", - "# Clone the dataset\n", - "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:4]\n", - "\n", - "client = Client()\n", - "\n", - "models = [\"gpt-3.5-turbo-16k\"]\n", + "## Benchmarking\n", "\n", - "for model in models:\n", - " print()\n", - " agent_factory = agents.OpenAIAgentFactory(task, model=model)\n", - " test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory,\n", - " evaluation=task.get_eval_config(),\n", - " verbose=False,\n", - " concurrency_level=1,\n", - " project_name=f\"typewriter-26-{model}-{experiment_uuid}\",\n", - " tags=[model],\n", - " project_metadata={\n", - " \"model\": model,\n", - " \"arch\": \"openai-functions-agent\",\n", - " \"id\": experiment_uuid,\n", - " },\n", - " )" + "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just to here to explain and explore the task." ] } ], diff --git a/langchain_benchmarks/tool_usage/agents/adapters.py b/langchain_benchmarks/tool_usage/agents/adapters.py index be060f8..b5ecbe4 100644 --- a/langchain_benchmarks/tool_usage/agents/adapters.py +++ b/langchain_benchmarks/tool_usage/agents/adapters.py @@ -41,27 +41,8 @@ def _read_state(*args: Any, **kwargs: Any) -> Any: else: return None - def _format_input(inputs: dict) -> dict: - """Make sure that the input is always called `input`.""" - - if "question" not in inputs: - raise ValueError( - "Expected 'question' to be in the inputs. Found only the following " - f"keys {sorted(inputs.keys())}." - ) - - inputs = inputs.copy() # Because 'question' is popped below - - if "input" not in inputs: - return {"input": inputs.pop("question"), **inputs} - return inputs - - runnable = ( - RunnableLambda(_format_input).with_config({"run_name": "Format Input"}) - | agent_executor - | RunnableLambda(_ensure_output_exists).with_config( - {"run_name": "Ensure Output"} - ) + runnable = agent_executor | RunnableLambda(_ensure_output_exists).with_config( + {"run_name": "Ensure Output"} ) if state_reader is not None: