From 022fbe8a5deda86ddef3de5f1f4eb87054bfbc36 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 18 Apr 2024 09:50:27 -0400 Subject: [PATCH 1/3] x --- docs/source/notebooks/tool_usage/intro.ipynb | 120 +++++++++++++++---- 1 file changed, 98 insertions(+), 22 deletions(-) diff --git a/docs/source/notebooks/tool_usage/intro.ipynb b/docs/source/notebooks/tool_usage/intro.ipynb index 06d89d9..514ff3c 100644 --- a/docs/source/notebooks/tool_usage/intro.ipynb +++ b/docs/source/notebooks/tool_usage/intro.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "3b9b82fc-b689-4a25-b718-99ecc2fc6867", "metadata": { "tags": [] @@ -136,19 +136,21 @@ "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", "\n", "Success is measured by the ability to answer the question correctly, and efficiently. \n", - "Multiverse Math ToolUsageTask594f9f60-30a0-49bf-b075-f44beabf546aAn environment that contains a few basic math operations, but with altered results.\n", + "Multiverse Math ToolUsageTask47ed57bc-e852-4f84-a23e-cce4793864e9An environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", - "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math. \n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\n", + "\n", + "This task is associated with 20 test examples. 
\n", "\n", "" ], "text/plain": [ - "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. 
The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})])" + "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. 
The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})])" ] }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -169,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "7543739b-d212-4249-9b4a-fc406a58c9c7", "metadata": { "tags": [] @@ -198,10 +200,10 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. 
The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -248,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "f201dbbe-7d92-4bc7-b4b5-ea8901dd2970", "metadata": { "tags": [] @@ -257,13 +259,13 @@ { "data": { "text/plain": [ - "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x1277c18a0>),\n", - " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x1277c13a0>),\n", - " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x1277c19e0>),\n", - " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x1277c1800>)]" + "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x74e8b68b0040>),\n", + " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x74e8b68b00e0>),\n", + " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x74e8b68b0180>),\n", + " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x74e8b68b0220>)]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -275,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "b07957ee-ae52-47d4-a4ff-aa99d4d9bdaf", "metadata": { "tags": [] @@ -287,7 +289,7 @@ "'OK'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": 
"40fbb9b6-00f6-4445-b480-00eed6b5b3aa", "metadata": { "tags": [] @@ -312,7 +314,7 @@ "'aac'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -326,11 +328,85 @@ "id": "8d39b9b3-d4da-49bc-b3db-8a4165b1db55", "metadata": {}, "source": [ - "## Creating an agent\n", + "## Agent Factory\n", + "\n", + "Now that you know how the test environment works, it's time to define an agent!\n", + "\n", + "Let's create an agent using LangChain's [create_tool_calling_agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/) functionality.\n", + "\n", + "Here's a list of [chat models that support tool calling](https://python.langchain.com/docs/integrations/chat/)." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "2cbcbc98-2e4d-4ba4-9732-7fba340d247a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import create_tool_calling_agent\n", + "from langchain_anthropic import ChatAnthropic\n", + "from langchain_core.prompts import ChatPromptTemplate\n", "\n", - "So now that you know how the test environment works, it's time to define an agent! \n", + "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", "\n", - "We will follow the example in the LangChain documentation to [define an OpenAI tool using agent](https://python.langchain.com/docs/modules/agents/). 
" + "model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"{instructions}\"),\n", + " (\"human\", \"{input}\"), # Populated from task.instructions automatically\n", + " (\"placeholder\", \"{agent_scratchpad}\"),\n", + " ]\n", + ")\n", + "\n", + "# Construct the Tools agent\n", + "agent_factory = StandardAgentFactory(task, model, prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "6387e505-58ef-4c6d-aaa8-fc60db064120", + "metadata": {}, + "outputs": [], + "source": [ + "agent = agent_factory()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "de03f603-959a-430d-857d-0ef6b59ac50c", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'agent_definition' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[19], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_benchmarks\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtool_usage\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01magents\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CustomRunnableAgentFactory\n\u001b[0;32m----> 3\u001b[0m agent_factory \u001b[38;5;241m=\u001b[39m CustomRunnableAgentFactory(task, agent\u001b[38;5;241m=\u001b[39m\u001b[43magent_definition\u001b[49m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'agent_definition' is not defined" + ] + } + ], + "source": [ + "from langchain_benchmarks.tool_usage.agents import CustomRunnableAgentFactory\n", + "\n", + "agent_factory = CustomRunnableAgentFactory(task, agent=agent_definition)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4debf19-2b85-4036-b56b-edd1c52e52cd", + 
"metadata": {}, + "outputs": [], + "source": [ + "# agent_factory = StandardAgentFactory(\n", + "# task, model, prompt, rate_limiter=rate_limiter\n", + "# )\n" ] }, { @@ -776,7 +852,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.11.4" } }, "nbformat": 4, From 5b4287037c39a0e0a7b2f592b5281b595fc036c7 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 18 Apr 2024 10:44:47 -0400 Subject: [PATCH 2/3] x --- docs/source/notebooks/tool_usage/intro.ipynb | 506 ++++++++----------- 1 file changed, 223 insertions(+), 283 deletions(-) diff --git a/docs/source/notebooks/tool_usage/intro.ipynb b/docs/source/notebooks/tool_usage/intro.ipynb index 514ff3c..a0fd93b 100644 --- a/docs/source/notebooks/tool_usage/intro.ipynb +++ b/docs/source/notebooks/tool_usage/intro.ipynb @@ -147,7 +147,7 @@ "" ], "text/plain": [ - "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. 
\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. 
The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})])" + "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. 
For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})])" ] }, "execution_count": 1, @@ -200,7 +200,7 @@ "" ], "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, "execution_count": 2, @@ -259,10 +259,10 @@ { "data": { "text/plain": [ - "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x74e8b68b0040>),\n", - " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x74e8b68b00e0>),\n", - " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x74e8b68b0180>),\n", - " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x74e8b68b0220>)]" + "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x78972c6fc7c0>),\n", + " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x78972c6fc220>),\n", + " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x78972c6fc900>),\n", + " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x78972c6fc9a0>)]" ] }, "execution_count": 3, @@ -328,24 +328,25 @@ "id": "8d39b9b3-d4da-49bc-b3db-8a4165b1db55", "metadata": {}, "source": [ - "## Agent Factory\n", + "## Create an Agent!\n", "\n", - "Now that you know how the test environment works, it's time to define an agent!\n", + "Now that you know how the test environment works, let's create an agent that we can test!\n", "\n", - "Let's create an agent using LangChain's [create_tool_calling_agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/) functionality.\n", + "Because an agent interacts with the environment via tools and can change the state of the environment during the course of an agent run, what we actually want is the ability to create a fresh 
agent and a fresh environment for each test run.\n", "\n", - "Here's a list of [chat models that support tool calling](https://python.langchain.com/docs/integrations/chat/)." + "We'll do this using a factory. A factory is just a fancy name in computer science for an object that can create other objects. In this case, we'll have an Agent Factory that we can call and it'll create a fresh agent for us on each call.\n", + "\n", + "We'll use the StandardAgentFactory, which under the hood creates a standard LangChain [tool calling agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/). It can be used with any [Chat Model that supports tool calling](https://python.langchain.com/docs/integrations/chat/)." ] }, { "cell_type": "code", - "execution_count": 26, - "id": "2cbcbc98-2e4d-4ba4-9732-7fba340d247a", + "execution_count": 6, + "id": "db65c253-7710-4c7b-b968-0662ec089030", "metadata": {}, "outputs": [], "source": [ - "from langchain.agents import create_tool_calling_agent\n", - "from langchain_anthropic import ChatAnthropic\n", + "from langchain_anthropic.chat_models import ChatAnthropic\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n", @@ -353,186 +354,91 @@ "model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n", "prompt = ChatPromptTemplate.from_messages(\n", " [\n", - " (\"system\", \"{instructions}\"),\n", - " (\"human\", \"{input}\"), # Populated from task.instructions automatically\n", + " (\"system\", \"{instructions}\"), # Populated from task.instructions automatically\n", + " (\"human\", \"{question}\"),\n", " (\"placeholder\", \"{agent_scratchpad}\"),\n", " ]\n", ")\n", "\n", - "# Construct the Tools agent\n", "agent_factory = StandardAgentFactory(task, model, prompt)" ] }, { - "cell_type": "code", - "execution_count": 27, - "id": "6387e505-58ef-4c6d-aaa8-fc60db064120", - "metadata": {}, - "outputs": [], - "source": [ - 
"agent = agent_factory()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "de03f603-959a-430d-857d-0ef6b59ac50c", + "cell_type": "markdown", + "id": "f0f7a17d-9afa-4ce1-b00d-e0f3bcd66862", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'agent_definition' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[19], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_benchmarks\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtool_usage\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01magents\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CustomRunnableAgentFactory\n\u001b[0;32m----> 3\u001b[0m agent_factory \u001b[38;5;241m=\u001b[39m CustomRunnableAgentFactory(task, agent\u001b[38;5;241m=\u001b[39m\u001b[43magent_definition\u001b[49m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'agent_definition' is not defined" - ] - } - ], "source": [ - "from langchain_benchmarks.tool_usage.agents import CustomRunnableAgentFactory\n", - "\n", - "agent_factory = CustomRunnableAgentFactory(task, agent=agent_definition)" + "Let's test it out!" 
] }, { "cell_type": "code", - "execution_count": null, - "id": "b4debf19-2b85-4036-b56b-edd1c52e52cd", + "execution_count": 7, + "id": "6387e505-58ef-4c6d-aaa8-fc60db064120", "metadata": {}, "outputs": [], "source": [ - "# agent_factory = StandardAgentFactory(\n", - "# task, model, prompt, rate_limiter=rate_limiter\n", - "# )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8827186a-8ed3-43c7-956c-71342e0a7bf2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.agents.format_scratchpad.openai_tools import (\n", - " format_to_openai_tool_messages,\n", - ")\n", - "from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser\n", - "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n", - "from langchain.tools.render import (\n", - " format_tool_to_openai_function,\n", - " format_tool_to_openai_tool,\n", - ")\n", - "from langchain_community.chat_models import ChatOpenAI\n", - "from langchain_core.runnables import RunnableParallel\n", - "\n", - "tools = task.create_environment().tools\n", - "formatted_tools = [format_tool_to_openai_tool(t) for t in tools]\n", - "llm = ChatOpenAI(model=\"gpt-3.5-turbo-1106\", temperature=0, model_kwargs={\"seed\": 42})\n", - "# Compose the llm call with the tools' JSON schemas\n", - "llm_with_tools = llm.bind(tools=formatted_tools)\n", - "format_inputs = RunnableParallel(\n", - " {\n", - " \"input\": lambda x: x[\"input\"],\n", - " \"agent_scratchpad\": lambda x: format_to_openai_tool_messages(\n", - " x[\"intermediate_steps\"]\n", - " ),\n", - " }\n", - ")\n", - "\n", - "prompt = ChatPromptTemplate.from_messages(\n", - " [\n", - " (\n", - " \"system\",\n", - " \"You are very powerful assistant, but bad at calculating lengths of words.\",\n", - " ),\n", - " (\"user\", \"{input}\"),\n", - " MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n", - " ]\n", - ")\n", - "agent_definition = (\n", - " # Input to this pipeline is a 
dictionary with \"input\" and \"intermediate_steps\" keys\n", - " format_inputs | prompt | llm_with_tools | OpenAIToolsAgentOutputParser()\n", - ")" + "agent = agent_factory()" ] }, { "cell_type": "markdown", - "id": "7614ab73-dc66-4f2e-9eeb-ff1711c113d0", + "id": "5c99a9bd-fa3e-4401-9062-77dbcff30d5c", "metadata": {}, "source": [ - "### Agent Factory\n", - "\n", - "As discussed above, each test environment tracks state. We want to create a new environment for each data point to avoid cross-contamination between rows in the dataset.\n", - "\n", - "We do this by defining an agent factory. Below, we integrate our agent into a `CustomRunnableAgentFactory`, which helps create the environment and agent executor for each data point." + "Here are the instructions for the task:" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "629416b3-b5d6-45ad-9bda-4f0642a0eb13", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_benchmarks.tool_usage.agents import CustomRunnableAgentFactory\n", - "\n", - "agent_factory = CustomRunnableAgentFactory(task, agent=agent_definition)" - ] - }, - { - "cell_type": "markdown", - "id": "7f06cf25-6766-4ea5-a566-36af045bdcf4", + "execution_count": 8, + "id": "8e1f0a3d-fed6-41f7-8825-08787a57ad98", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\"" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Let's check that the agent works" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "755f7920-831b-4595-8c6d-cca22c935198", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain import globals\n", - "\n", - "globals.set_verbose(True)" + "task.instructions" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c2804eae-5b0b-4a38-9dff-363a4fe8f324", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 9, + "id": "ce67d619-fa99-4c15-bc53-3fb08b40a201", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'abc',\n", + " 'output': [],\n", + " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_01JGUjFmZFmwmimu9oYyxZHM\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_011cJHuuY68BV8DbSRRURa5B\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_011cJHuuY68BV8DbSRRURa5B', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-ef7ffa5b-3e54-43fd-bb4b-89f17052c15e', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM'}, {'name': 'b', 'args': {}, 'id': 'toolu_011cJHuuY68BV8DbSRRURa5B'}, {'name': 'c', 'args': {}, 'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_011cJHuuY68BV8DbSRRURa5B', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng', 'index': 2}])], tool_call_id='toolu_01JGUjFmZFmwmimu9oYyxZHM'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_01JGUjFmZFmwmimu9oYyxZHM\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_011cJHuuY68BV8DbSRRURa5B\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_011cJHuuY68BV8DbSRRURa5B', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-ef7ffa5b-3e54-43fd-bb4b-89f17052c15e', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM'}, {'name': 'b', 'args': {}, 'id': 'toolu_011cJHuuY68BV8DbSRRURa5B'}, {'name': 'c', 'args': {}, 'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_011cJHuuY68BV8DbSRRURa5B', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng', 'index': 2}])], tool_call_id='toolu_011cJHuuY68BV8DbSRRURa5B'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_01JGUjFmZFmwmimu9oYyxZHM\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_011cJHuuY68BV8DbSRRURa5B\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_011cJHuuY68BV8DbSRRURa5B', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-ef7ffa5b-3e54-43fd-bb4b-89f17052c15e', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM'}, {'name': 'b', 'args': {}, 'id': 'toolu_011cJHuuY68BV8DbSRRURa5B'}, {'name': 'c', 'args': {}, 'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_01JGUjFmZFmwmimu9oYyxZHM', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_011cJHuuY68BV8DbSRRURa5B', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01UD6ZkfaUXE8tqmtoPCo4Ng', 'index': 2}])], tool_call_id='toolu_01UD6ZkfaUXE8tqmtoPCo4Ng'),\n", + " 'OK')],\n", + " 'state': 'abc'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "agent = agent_factory()\n", "agent.invoke({\"question\": \"abc\"})" ] }, - { - "cell_type": "code", - "execution_count": 12, - "id": "2aa68a11-d268-4868-a862-309801201989", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "globals.set_verbose(False)" - ] - }, { "cell_type": "markdown", "id": "e3bce984-7c9c-4f6e-a51b-01c3e2b6e00a", @@ -566,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "c88bd6e1-f77e-4668-a143-096929e897ee", "metadata": { "tags": [] @@ -575,10 +481,10 @@ { "data": { "text/plain": [ - "RunEvalConfig(evaluators=[], custom_evaluators=[], reference_key=None, prediction_key=None, input_key=None, eval_llm=None)" + "RunEvalConfig(evaluators=[], custom_evaluators=[], batch_evaluators=None, reference_key=None, prediction_key=None, input_key=None, eval_llm=None)" ] }, - "execution_count": 13, + "execution_count": 10, "metadata": {}, "output_type": 
"execute_result" } @@ -598,14 +504,13 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "60466447-eb37-4204-a497-fe47e8d8dd70", - "metadata": { - "tags": [] - }, + "execution_count": 11, + "id": "0770b442-f96a-4670-a4f7-3093f24fb64b", + "metadata": {}, "outputs": [], "source": [ "import datetime\n", + "import uuid\n", "\n", "from langsmith.client import Client\n", "\n", @@ -615,110 +520,79 @@ " model_registry,\n", " registry,\n", ")\n", - "from langchain_benchmarks.rate_limiting import RateLimiter\n", - "from langchain_benchmarks.tool_usage.agents import (\n", - " AnthropicToolUserFactory,\n", - " CustomAgentFactory,\n", - " CustomRunnableAgentFactory,\n", - " OpenAIAgentFactory,\n", - " OpenAIAssistantFactory,\n", - ")" + "from langchain_benchmarks.rate_limiting import RateLimiter" ] }, { - "cell_type": "code", - "execution_count": 15, - "id": "c448d139-9923-4cf6-af49-cbf3dff46bdc", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "15cbded4-5ab5-4b9b-9e88-77b24d3b750c", + "metadata": {}, "source": [ - "import uuid\n", - "\n", - "experiment_uuid = uuid.uuid4().hex[:]" + "Create an experiment ID. We'll use it to tag our runs, which we can later use to retrieve run data from LangSmith." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "200df769-4dd9-453b-8500-219c1d5305f6", - "metadata": { - "tags": [] - }, + "execution_count": 12, + "id": "c23208e3-01d1-4e83-9e4a-59544828f6f5", + "metadata": {}, "outputs": [], "source": [ - "tests = [\n", - " # 2-tuple of (architecture, model name)\n", - " (\"openai_functions\", \"gpt-3.5-turbo-1106\"), # Requires OpenAI Creds\n", - " (\"openai_functions\", \"gpt-3.5-turbo-0613\"),\n", - " (\"openai_functions\", \"gpt-4-1106-preview\"),\n", - " (\"openai_functions\", \"gpt-4-0613\"),\n", - " (\"openai_functions\", \"mistral-7b-instruct-v0.1\"), # Requires AnyScale creds\n", - " # Requires Anthropic Creds and Setting up Anthropics Tool Usage package.\n", - " # (\n", - " # \"anthropic_tool_user\",\n", - " # \"claude-2.1\",\n", - " # ),\n", - "]" + "experiment_id = uuid.uuid4().hex[:]" ] }, { "cell_type": "code", "execution_count": null, - "id": "5ddf7355-7db9-4adc-bc1e-f04c3d0ec57d", - "metadata": { - "tags": [] - }, + "id": "b2a3463b-1c9f-494b-bcbd-1dc1760ebf19", + "metadata": {}, "outputs": [], "source": [ "client = Client() # Launch langsmith client for cloning datasets\n", "today = datetime.date.today().isoformat()\n", - "rate_limiter = RateLimiter(requests_per_second=2)\n", "\n", - "for task in registry:\n", + "# You can use an optional rate limiter to rate limit your requests!\n", + "rate_limiter = RateLimiter(requests_per_second=1)\n", + "\n", + "\n", + "# Set up 2-tuples of (model name, model instance)\n", + "tests = [\n", + " (\n", + " \"claude-3-haiku-20240307\",\n", + " ChatAnthropic(model=\"claude-3-haiku-20240307\", temperature=0),\n", + " )\n", + "]\n", + "\n", + "\n", + "for task in registry.tasks:\n", " if task.type != \"ToolUsageTask\":\n", " continue\n", "\n", - " dataset_name = task.name\n", + " dataset_name = task.name + f\" ({today})\"\n", " clone_public_dataset(task.dataset_id, dataset_name=dataset_name)\n", "\n", - " for arch, model in tests:\n", + " for model_name, model in 
tests:\n", " print()\n", - " print(f\"Benchmarking {task.name} with model: {model} and arch: {arch}\")\n", + " print(f\"Benchmarking {task.name} with model: {model_name}\")\n", " eval_config = task.get_eval_config()\n", "\n", - " if arch == \"openai_functions\":\n", - " agent_factory = OpenAIAgentFactory(\n", - " task, model=model, rate_limiter=rate_limiter\n", - " )\n", - " elif arch == \"custom_agent\":\n", - " agent_factory = CustomAgentFactory(\n", - " task, model=model, rate_limiter=rate_limiter\n", - " )\n", - " elif arch == \"custom_runnable_agent\":\n", - " # For this, the model would have to be a runnable object\n", - " agent_factory = CustomRunnableAgentFactory(task, agent=model)\n", - " elif arch == \"anthropic_tool_user\":\n", - " agent_factory = AnthropicToolUserFactory(task)\n", - " else:\n", - " raise ValueError()\n", + " agent_factory = StandardAgentFactory(\n", + " task, model, prompt, rate_limiter=rate_limiter\n", + " )\n", "\n", " client.run_on_dataset(\n", " dataset_name=dataset_name,\n", " llm_or_chain_factory=agent_factory,\n", " evaluation=eval_config,\n", " verbose=False,\n", - " project_name=f\"{model}-{task.name}-{today}-{experiment_uuid}\",\n", - " tags=[model],\n", + " project_name=f\"{model_name}-{task.name}-{today}-{experiment_id}\",\n", " concurrency_level=5,\n", " project_metadata={\n", - " \"model\": model,\n", + " \"model\": model_name,\n", " \"id\": experiment_uuid,\n", " \"task\": task.name,\n", " \"date\": today,\n", " \"langchain_benchmarks_version\": __version__,\n", - " \"arch\": arch,\n", " },\n", " )" ] @@ -732,6 +606,8 @@ "\n", "The following sections demonstrate slightly more \"advanced\" usage if you want to completely customize the agent runtime in a way that is compatible with our test runner.\n", "\n", + "We'll also apply an adapter to the agent which will capture its inputs and outputs (e.g., add information about the agent's environment at the end of the run) so that we can evaluate it.\n", + "\n", "### Custom 
Agent Factory\n", "\n", "If you want even more configurability beyond what the `CustomRunnableAgentFactory` provides, you can create your owne `AgentFactory` using the following pattern.\n", @@ -742,33 +618,33 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "bca8ad69-9956-451c-b639-ea30c77d982f", - "metadata": { - "tags": [] - }, + "execution_count": 16, + "id": "69351864-2e97-43df-81ae-5067cbf5e471", + "metadata": {}, "outputs": [], "source": [ - "from langchain.agents import AgentType, initialize_agent\n", - "from langchain.chat_models import ChatOpenAI\n", + "from typing import Optional\n", + "\n", + "from langchain.agents import AgentExecutor, create_tool_calling_agent\n", + "from langchain_anthropic import ChatAnthropic\n", + "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "from langchain_benchmarks.schema import ExtractionTask\n", - "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "44839ebe-48ea-4d5b-87b4-2ad72acacb71", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class AgentFactory:\n", - " def __init__(self, task: ExtractionTask, model: str) -> None:\n", + "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter\n", + "\n", + "\n", + "class CustomAgentFactory:\n", + " def __init__(\n", + " self,\n", + " task: ExtractionTask,\n", + " *,\n", + " # It can be useful to add a rate-limiter\n", + " # which will limit ther number of requests per second\n", + " # when running evaluation.\n", + " rate_limiter: Optional[RateLimiter] = None,\n", + " ) -> None:\n", " self.task = task\n", - " self.model = model\n", + " self.rate_limiter = rate_limiter\n", "\n", " def __call__(self):\n", " # This factory creates a new environment for every agent run.\n", @@ -777,63 +653,127 @@ " # At the end of the run, the environment state will be read.\n", " env = task.create_environment() # Create a new 
environment for every agent run!\n", " tools = env.tools\n", - " llm = ChatOpenAI(temperature=0, model=self.model)\n", - " agent_executor = initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.OPENAI_FUNCTIONS,\n", - " return_intermediate_steps=True,\n", + " model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n", + " prompt = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", self.task.instructions),\n", + " (\n", + " \"human\",\n", + " \"{question}\",\n", + " ), # Populated from task.instructions automatically\n", + " (\"placeholder\", \"{agent_scratchpad}\"),\n", + " ]\n", + " )\n", + "\n", + " # This is the standard tool calling agent implementation\n", + " # Feel free to replace it with any other implementation you want!\n", + " # https://python.langchain.com/docs/modules/agents/how_to/custom_agent/\n", + " agent = create_tool_calling_agent(model, env.tools, prompt)\n", + "\n", + " if self.rate_limiter:\n", + " agent = with_rate_limit(agent, self.rate_limiter)\n", + "\n", + " executor = AgentExecutor(\n", + " agent=agent,\n", + " tools=env.tools,\n", " handle_parsing_errors=True,\n", + " return_intermediate_steps=True,\n", " )\n", + "\n", " # Apply the adapters so that inputs and outputs match dataset schema\n", " # state_reader automatically adds the state of the environment at the end of the run.\n", - " return apply_agent_executor_adapter(agent_executor, state_reader=env.read_state)" + " return apply_agent_executor_adapter(executor, state_reader=env.read_state)" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "8b6108e4-c7cc-42e8-a23d-89c7b94fab6c", - "metadata": { - "tags": [] - }, + "execution_count": 17, + "id": "18a96a6f-812b-4b0e-83c5-d001bf50851e", + "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Name Tool Usage - Typewriter (26 tools)
Type ToolUsageTask
Dataset ID 128af05e-aa00-4e3b-a958-d166dd450581
DescriptionEnvironment with 26 tools each tool represents a letter of the alphabet.\n", + "\n", + "The objective of this task is to evaluate the model's ability the use tools\n", + "for a simple repetition task.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", + "\n", + "This is a variation of the typer writer task, where 26 parameterless tools are\n", + "given instead of a single tool that takes a letter as an argument.
" + ], "text/plain": [ - "{'input': 'xypxy',\n", - " 'output': 'I have typed \"xypxy\" as you requested.',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'p'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'p'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"p\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK')],\n", - " 'state': 'xypxy'}" + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use 
tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})" ] }, - "execution_count": 24, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "agent_factory = AgentFactory(task, \"gpt-4\")\n", - "agent = agent_factory()\n", - "agent.invoke({\"question\": \"xypxy\"})" + "task" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9bdf9328-0103-48d3-8dfc-933423db9796", + "execution_count": 18, + "id": "a7bd4af3-c0f1-4308-abbf-330d7497b3e3", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "custom_agent_factory = CustomAgentFactory(task)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c5b69b7c-4294-47d1-85d7-47d718945898", + "metadata": {}, + "outputs": [], + "source": [ + "agent = custom_agent_factory()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1ac24ef5-d3ca-41aa-b888-7ebcd8a92ff4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'abc',\n", + " 'output': [],\n", + " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_016f6CZwwFmdz2h8KbdGRVjj'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01JvfeTpU3hEuS7PknFk5a8S'),\n", + " 'OK'),\n", + " (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\nresponded: [{\\'text\\': \\'\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. 
No parameters are required for these functions.\\\\n\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01NbBCY5Fg62RsyAAUd4n2g1'),\n", + " 'OK')],\n", + " 'state': 'abc'}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent.invoke({\"question\": \"abc\"})" + ] } ], "metadata": { From da4731b74adf3eb2b42ce9a6dfbf46348c1cc09d Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 18 Apr 2024 10:44:54 -0400 Subject: [PATCH 3/3] x --- .../tool_usage/agents/adapters.py | 23 ++----------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git 
a/langchain_benchmarks/tool_usage/agents/adapters.py b/langchain_benchmarks/tool_usage/agents/adapters.py index be060f8..b5ecbe4 100644 --- a/langchain_benchmarks/tool_usage/agents/adapters.py +++ b/langchain_benchmarks/tool_usage/agents/adapters.py @@ -41,27 +41,8 @@ def _read_state(*args: Any, **kwargs: Any) -> Any: else: return None - def _format_input(inputs: dict) -> dict: - """Make sure that the input is always called `input`.""" - - if "question" not in inputs: - raise ValueError( - "Expected 'question' to be in the inputs. Found only the following " - f"keys {sorted(inputs.keys())}." - ) - - inputs = inputs.copy() # Because 'question' is popped below - - if "input" not in inputs: - return {"input": inputs.pop("question"), **inputs} - return inputs - - runnable = ( - RunnableLambda(_format_input).with_config({"run_name": "Format Input"}) - | agent_executor - | RunnableLambda(_ensure_output_exists).with_config( - {"run_name": "Ensure Output"} - ) + runnable = agent_executor | RunnableLambda(_ensure_output_exists).with_config( + {"run_name": "Ensure Output"} ) if state_reader is not None: