diff --git a/docs/source/notebooks/tool_usage.ipynb b/docs/source/notebooks/tool_usage.ipynb index 5af53720..51091058 100644 --- a/docs/source/notebooks/tool_usage.ipynb +++ b/docs/source/notebooks/tool_usage.ipynb @@ -19,8 +19,7 @@ }, "outputs": [], "source": [ - "from langchain_benchmarks.tool_usage import registry\n", - "from langchain_benchmarks import clone_public_dataset" + "from langchain_benchmarks import clone_public_dataset, registry" ] }, { @@ -44,29 +43,46 @@ "text/html": [ "\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", + "\n", + "\n", "\n", "
IDName Dataset ID Description
IDName Dataset ID Description
0Tool Usage - Alphae95d45da-aaa3-44b3-ba2b-7c15ff6e46f5Environment with fake data about users and their locations and favorite foods.\n", + "
0Tool Usage - Relational Data e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5Environment with fake data about users and their locations and favorite foods.\n", "\n", "The environment provides a set of tools that can be used to query the data.\n", "\n", - "The object is to evaluate the ability of an agent to use the tools\n", - "to answer questions about the data.\n", + "The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\n", "\n", - "The dataset contains 21 examples of varying difficulty. The difficulty is measured\n", - "by the number of tools that need to be used to answer the question.\n", + "The dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\n", "\n", - "Each example is composed of a question, a reference answer, and\n", - "information about the sequence in which tools should be used to answer\n", - "the question.\n", + "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", "\n", "Success is measured by the ability to answer the question correctly, and efficiently.
1Tool Usage - Typewriter (1 func)placeholder Environment with a single function that accepts a single letter as input, and "prints" it on a piece of paper.\n", + "\n", + "The objective of this task is to evaluate the ability to use the provided tools to repeat a given input string.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
2Tool Usage - Typewriter placeholder Environment with 26 functions each representing a letter of the alphabet.\n", + "\n", + "In this variation of the typewriter task, there are 26 parameterless functions, where each function represents a letter of the alphabet (instead of a single function that takes a letter as an argument).\n", + "\n", + "The objective of this task is to evaluate the ability to use the functions to repeat the given string.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
3Multiverse Math placeholder An environment that contains a few basic math operations, but with altered results.\n", + "\n", + "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", + "\n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.
" ], "text/plain": [ - "Registry(environments=[Environment(id=0, name='Tool Usage - Alpha', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', tools_factory=, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe object is to evaluate the ability of an agent to use the tools\\nto answer questions about the data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured\\nby the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and\\ninformation about the sequence in which tools should be used to answer\\nthe question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n')])" + "Registry(tasks=[Task(id=0, name='Tool Usage - Relational Data', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', create_environment=, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\"), Task(id=1, name='Tool Usage - Typewriter (1 func)', dataset_id='placeholder', create_environment=, description='Environment with a single function that accepts a single letter as input, and \"prints\" it on a piece of paper.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is \\'abc\\', the tools \\'a\\', \\'b\\', and \\'c\\' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n', instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the function with a single letter at a time.\"), Task(id=2, name='Tool Usage - Typewriter', dataset_id='placeholder', create_environment=, description=\"Environment with 26 functions each representing a letter of the alphabet.\\n\\nIn this variation of the typewriter task, there are 26 parameterless functions, where each function represents a letter of the alphabet (instead of a single function that takes a letter as an argument).\\n\\nThe object is to evaluate the ability of use the functions to repeat the given string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", instructions=\"Repeat the given string by using the provided tools. 
Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\"), Task(id=3, name='Multiverse Math', dataset_id='placeholder', create_environment=, description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', instructions='You are requested to solve math questions in an alternate mathematical universe. The rules of association, commutativity, and distributivity still apply, but the operations have been altered to yield different results than expected. Solve the given math questions using the provided tools. Do not guess the answer.')])" ] }, "execution_count": 2, @@ -81,19 +97,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "671282f8-c455-4390-b018-e53bbd833093", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "alpha = registry[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "cf6dca5d-63cf-4315-8206-726abe816473", + "id": "60f22779-a948-4833-8e8c-ace9ef17f56f", "metadata": { "tags": [] }, @@ -104,7 +108,7 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "
ID 0
Name Tool Usage - Alpha
Name Tool Usage - Relational Data
Dataset ID e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5
DescriptionEnvironment with fake data about users and their locations and favorite foods.\n", "\n", @@ -113,16 +117,56 @@ "
" ], "text/plain": [ - "Environment(id=0, name='Tool Usage - Alpha', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', tools_factory=, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe object is to evaluate the ability of an agent to use the tools\\nto answer questions about the data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured\\nby the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and\\ninformation about the sequence in which tools should be used to answer\\nthe question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n')" + "Task(id=0, name='Tool Usage - Relational Data', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', create_environment=, description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\")" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "alpha" + "task = registry[\"Tool Usage - Relational Data\"]\n", + "task" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "49be36d2-343e-49df-8369-dd5bac405d5e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Environment with fake data about users and their locations and favorite foods.\n", + "\n", + "The environment provides a set of tools that can be used to query the data.\n", + "\n", + "The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\n", + "\n", + "The dataset contains 21 examples of varying difficulty. 
The difficulty is measured by the number of tools that need to be used to answer the question.\n", + "\n", + "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", + "\n", + "Success is measured by the ability to answer the question correctly, and efficiently.\n", + "\n" ] } ], "source": [ "print(task.description)" ] }, { "cell_type": "markdown", "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", "metadata": {}, "source": [ "Clone the dataset associated with this task" ] }, { @@ -133,17 +177,32 @@ "tags": [] }, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "908dfb7a73ea4332a77336ba00ed1ba4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/21 [00:00 AgentExecutor:\n", " \"\"\"Agent Executor\"\"\"\n", " llm = ChatOpenAI(\n", @@ -183,8 +241,10 @@ " temperature=0,\n", " )\n", "\n", + " env = task.create_environment()\n", + "\n", " llm_with_tools = llm.bind(\n", - " functions=[format_tool_to_openai_function(t) for t in TOOLS]\n", + " functions=[format_tool_to_openai_function(t) for t in env.tools]\n", " )\n", " prompt = ChatPromptTemplate.from_messages(\n", " [\n", @@ -218,7 +278,7 @@ " return (\n", " AgentExecutor(\n", " agent=runnable_agent,\n", - " tools=TOOLS,\n", + " tools=env.tools,\n", " handle_parsing_errors=True,\n", " return_intermediate_steps=True,\n", " )\n", @@ -236,27 +296,29 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 7, "id": "0e4896fa-3633-44a1-857f-80a263cf2e03", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { "text/plain": [ "{'question': 'who is bob?',\n", " 'output': 'Bob is a user with the ID 21.',\n", " 'intermediate_steps': [(AgentActionMessageLog(tool='find_users_by_name', tool_input={'name': 'bob'}, log=\"\\nInvoking: `find_users_by_name` with `{'name': 'bob'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"name\": \"bob\"\\n}', 'name': 'find_users_by_name'}})]),\n", " [{'id': 21, 'name': 'Bob'},\n", " {'id': 41, 'name': 'Donna'},\n", " {'id': 1, 'name': 'Alice'},\n", " {'id': 35, 'name': 'Charlie'},\n", " {'id': 42, 'name': 'Eve'},\n", " {'id': 43, 'name': 'Frank The Cat'}]),\n", " (AgentActionMessageLog(tool='get_user_name', tool_input={'user_id': 21}, log=\"\\nInvoking: `get_user_name` with `{'user_id': 21}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"user_id\": 21\\n}', 'name': 'get_user_name'}})]),\n", " 'Bob')]}" ] }, - "execution_count": 23, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], @@ -277,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 8, "id": "513042fe-2878-44f8-ae84-05b9d521c1de", "metadata": { "tags": []
@@ -290,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 9, "id": "2bedd9d1-fc06-4066-9f89-b874ae818d82", "metadata": { "tags": [] @@ -304,11 +366,13 @@ "cell_type": "code", "execution_count": null, "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "test_run = client.run_on_dataset(\n", - " dataset_name=alpha.name,\n", + " dataset_name=task.name,\n", " llm_or_chain_factory=agent_factory,\n", " evaluation=STANDARD_AGENT_EVALUATOR,\n", " verbose=True,\n",
diff --git a/langchain_benchmarks/__init__.py b/langchain_benchmarks/__init__.py
index 0bdaa0ac..deb45bc9 100644
--- a/langchain_benchmarks/__init__.py
+++ b/langchain_benchmarks/__init__.py
@@ -1,7 +1,8 @@
+from langchain_benchmarks.registration import registry
 from langchain_benchmarks.utils._langsmith import (
     clone_public_dataset,
     download_public_dataset,
 )
 
 # Please keep this list sorted!
-__all__ = ["clone_public_dataset", "download_public_dataset"]
+__all__ = ["clone_public_dataset", "download_public_dataset", "registry"]
diff --git a/langchain_benchmarks/registration.py b/langchain_benchmarks/registration.py
new file mode 100644
index 00000000..cb1c0f3a
--- /dev/null
+++ b/langchain_benchmarks/registration.py
@@ -0,0 +1,191 @@
+"""Registry of environments for ease of access."""
+import dataclasses
+from typing import Sequence, Union
+
+from tabulate import tabulate
+
+from langchain_benchmarks.schema import Task
+from langchain_benchmarks.tool_usage.environments import (
+    relational_data,
+    type_writer,
+    type_writer_26_funcs,
+    multiverse_math,
+)
+
+
+@dataclasses.dataclass(frozen=True)
+class Registry:
+    tasks: Sequence[Task]
+
+    def get_task(self, name_or_id: Union[int, str]) -> Task:
+        """Get the task with the given name or ID."""
+        for task in self.tasks:
+            if task.name == name_or_id or task.id == name_or_id:
+                return task
+        raise ValueError(f"Unknown task {name_or_id}")
+
+    def __post_init__(self) -> None:
+        """Validate that all the tasks have unique names and IDs."""
+        seen_names = set()
+        seen_ids = set()
+        for task in self.tasks:
+            if task.name in seen_names:
+                raise ValueError(
+                    f"Duplicate task name {task.name}. " f"Task names must be unique."
+                )
+            seen_names.add(task.name)
+            if task.id in seen_ids:
+                raise ValueError(
+                    f"Duplicate task ID {task.id}. " f"Task IDs must be unique."
+                )
+            seen_ids.add(task.id)  # Record the ID so later duplicates are caught
+
+    def _repr_html_(self) -> str:
+        """Return an HTML representation of the registry."""
+        headers = [
+            "ID",
+            "Name",
+            "Dataset ID",
+            "Description",
+        ]
+        table = [
+            [
+                task.id,
+                task.name,
+                task.dataset_id,
+                task.description,
+            ]
+            for task in self.tasks
+        ]
+        return tabulate(table, headers=headers, tablefmt="html")
+
+    def __getitem__(self, key: Union[int, str]) -> Task:
+        """Get a task from the registry."""
+        if isinstance(key, slice):
+            raise NotImplementedError("Slicing is not supported.")
+        elif isinstance(key, (int, str)):
+            # An integer key looks up a task by ID, a string key by name
+            return self.get_task(key)
+        else:
+            raise TypeError("Key must be an integer or a string.")
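+
+
+# A minimal usage sketch (illustrative only; it relies solely on names defined
+# in this module and in langchain_benchmarks.schema):
+#
+#     from langchain_benchmarks import registry
+#
+#     task = registry["Tool Usage - Relational Data"]  # look up by name
+#     same_task = registry[0]  # ...or by integer ID
+#     env = task.create_environment()  # a fresh Environment with tools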
+
+
+# Using lower-case naming to make the API a bit prettier when used in a notebook
+registry = Registry(
+    tasks=[
+        Task(
+            id=0,
+            name="Tool Usage - Relational Data",
+            dataset_id=relational_data.DATASET_ID,
+            create_environment=relational_data.get_environment,
+            instructions=(
+                """\
+Please answer the user's question by using the tools provided. Do not guess the \
+answer. Keep in mind that entities like users, foods and locations have both a \
+name and an ID, which are not the same."""
+            ),
+            description=(
+                """\
+Environment with fake data about users and their locations and favorite foods.
+
+The environment provides a set of tools that can be used to query the data.
+
+The objective of this task is to evaluate the ability to use the provided tools \
+to answer questions about relational data.
+
+The dataset contains 21 examples of varying difficulty. The difficulty is measured \
+by the number of tools that need to be used to answer the question.
+
+Each example is composed of a question, a reference answer, and \
+information about the sequence in which tools should be used to answer \
+the question.
+
+Success is measured by the ability to answer the question correctly, and efficiently.
+"""
+            ),
+        ),
+        Task(
+            id=1,
+            name="Tool Usage - Typewriter (1 func)",
+            dataset_id="placeholder",
+            create_environment=type_writer.get_environment,
+            instructions=(
+                "Repeat the given string by using the provided tools. "
+                "Do not write anything else or provide any explanations. "
+                "For example, if the string is 'abc', you must invoke the tools "
+                "'a', 'b', and 'c' in that order. "
+                "Please invoke the function with a single letter at a time."
+            ),
+            description=(
+                """\
+Environment with a single function that accepts a single letter as input, and \
+"prints" it on a piece of paper.
+
+The objective of this task is to evaluate the ability to use the provided \
+tools to repeat a given input string.
+
+For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
+in that order.
+
+The dataset includes examples of varying difficulty. The difficulty is measured \
+by the length of the string.
+"""
+            ),
+        ),
+        Task(
+            id=2,
+            name="Tool Usage - Typewriter",
+            dataset_id="placeholder",
+            create_environment=type_writer_26_funcs.get_environment,
+            instructions=(
+                "Repeat the given string by using the provided tools. "
+                "Do not write anything else or provide any explanations. "
+                "For example, if the string is 'abc', you must invoke the tools "
+                "'a', 'b', and 'c' in that order. "
+                "Please invoke the functions without any arguments."
+            ),
+            description=(
+                """\
+Environment with 26 functions each representing a letter of the alphabet.
+
+In this variation of the typewriter task, there are 26 parameterless functions, where \
+each function represents a letter of the alphabet (instead of a single function that \
+takes a letter as an argument).
+
+The objective of this task is to evaluate the ability to use the functions to repeat the given string.
+
+For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
+in that order.
+
+The dataset includes examples of varying difficulty. The difficulty is measured \
+by the length of the string.
+"""
+            ),
+        ),
+        Task(
+            id=3,
+            name="Multiverse Math",
+            dataset_id="placeholder",
+            create_environment=multiverse_math.get_environment,
+            instructions=(
+                "You are requested to solve math questions in an alternate "
+                "mathematical universe. The rules of association, commutativity, "
+                "and distributivity still apply, but the operations have been "
+                "altered to yield different results than expected. Solve the "
+                "given math questions using the provided tools. "
+                "Do not guess the answer."
+            ),
+            description=(
+                """\
+An environment that contains a few basic math operations, but with altered results.
+
+For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
+The basic operations retain some basic properties, such as commutativity, \
+associativity, and distributivity; however, the results are different than expected.
+
+The objective of this task is to evaluate the ability to use the provided tools to \
+solve simple math questions and ignore any innate knowledge about math.
+"""
+            ),
+        ),
+    ]
+)
diff --git a/langchain_benchmarks/schema.py b/langchain_benchmarks/schema.py
new file mode 100644
index 00000000..06717e6f
--- /dev/null
+++ b/langchain_benchmarks/schema.py
@@ -0,0 +1,62 @@
+"""Schema for the Langchain Benchmarks."""
+import dataclasses
+from typing import List, Callable, Any, Optional
+
+from langchain.tools import BaseTool
+from tabulate import tabulate
+
+
+@dataclasses.dataclass(frozen=True)
+class Environment:
+    """An instance of an environment for tool usage."""
+
+    tools: List[BaseTool]
+    """The tools that can be used in the environment."""
+
+    read_state: Optional[Callable[[], Any]] = None
+    """A function that returns the current state of the environment."""
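+
+
+# A minimal sketch of how an Environment is meant to be consumed (illustrative;
+# `some_tools` and `read_paper` are hypothetical stand-ins):
+#
+#     env = Environment(tools=some_tools, read_state=read_paper)
+#     env.tools  # hand these to an agent
+#     if env.read_state is not None:
+#         env.read_state()  # inspect the environment state after a run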
+
+
+@dataclasses.dataclass(frozen=True)
+class Task:
+    """A definition for a task."""
+
+    id: int
+    """The ID of the task."""
+    name: str
+    """The name of the task."""
+
+    dataset_id: str
+    """The ID of the langsmith public dataset.
+
+    This dataset contains expected inputs/outputs for the environment, and
+    can be used to evaluate the performance of a model/agent etc.
+    """
+
+    create_environment: Callable[
+        [], Environment
+    ]  # Specialized for tool usage; may be generalized later
+    """Factory that returns an environment."""
+
+    description: str
+    """Description of the task for a data science practitioner.
+
+    This can contain information about the task, the dataset, the tools available
+    etc.
+    """
+
+    instructions: str
+    """Instructions for the agent/chain/llm."""
+
+    def _repr_html_(self) -> str:
+        """Return an HTML representation of the task."""
+        table = [
+            ["ID", self.id],
+            ["Name", self.name],
+            ["Dataset ID", self.dataset_id],
+            ["Description", self.description[:100] + "..."],
+        ]
+        return tabulate(
+            table,
+            tablefmt="html",
+        )
diff --git a/langchain_benchmarks/tool_usage/README.md b/langchain_benchmarks/tool_usage/README.md
index 3d88253c..5c5d033c 100644
--- a/langchain_benchmarks/tool_usage/README.md
+++ b/langchain_benchmarks/tool_usage/README.md
@@ -1,19 +1,4 @@
-# Testing Agents
+# Tool usage
-
-This directory contains environments that can be used to test agent's ability
-to use tools and make decisions.
-
-## Environments
-
-Environments are named in the style of e[env_number]_[name].py.
-
-### e01_alpha
-
-* Consists of 3 relational tables of users, locations and foods.
-* Defines a set of tools that can be used these tables.
-* Agent should use the given tools to answer questions.
-
-## Running Evaluation
-
-Please refer to the following example to see how to set up and run evaluation
-for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb).
+This sub-package includes code to help test how well agents can use tools to
+make decisions.
\ No newline at end of file
diff --git a/langchain_benchmarks/tool_usage/__init__.py b/langchain_benchmarks/tool_usage/__init__.py
index b118d216..9f2458d7 100644
--- a/langchain_benchmarks/tool_usage/__init__.py
+++ b/langchain_benchmarks/tool_usage/__init__.py
@@ -1,6 +1,5 @@
 """Package for helping to evaluate agent runs."""
 from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
-from langchain_benchmarks.tool_usage.registration import registry
 
 # Please keep this list sorted!
-__all__ = ["registry", "STANDARD_AGENT_EVALUATOR"]
+__all__ = ["STANDARD_AGENT_EVALUATOR"]
diff --git a/langchain_benchmarks/tool_usage/environments/multiverse_math.py b/langchain_benchmarks/tool_usage/environments/multiverse_math.py
new file mode 100644
index 00000000..b52d95cf
--- /dev/null
+++ b/langchain_benchmarks/tool_usage/environments/multiverse_math.py
@@ -0,0 +1,88 @@
+"""Solve basic math questions using the provided tools.
+
+Must use the provided tools to solve the math question.
+
+To make sure that innate knowledge is not used, the math operations
+have been altered to yield different results than expected.
+
+The modified operations should yield different results, but still retain
+appropriate properties. For example, the modified multiplication operation
+should still be commutative.
+"""
+import math
+from typing import cast, List
+
+from langchain.tools import tool, BaseTool
+
+from langchain_benchmarks.schema import Environment
+
+
+def multiply(a: float, b: float) -> float:
+    """Multiply two numbers; a * b."""
+    return 1.1 * a * b
+
+
+def divide(a: float, b: float) -> float:
+    """Divide two numbers; a / b."""
+    # Division is neither commutative nor associative
+    return a / (b + 0.5)
+
+
+def add(a: float, b: float) -> float:
+    """Add two numbers; a + b."""
+    return a + b + 1.2
+
+
+def subtract(a: float, b: float) -> float:
+    """Subtract two numbers; a - b."""
+    return a - b - 3
+
+
+def power(a: float, b: float) -> float:
+    """Raise a number to a power; a ** b."""
+    return a ** (b + 2)
+
+
+def log(a: float, base: float) -> float:
+    """Take the log of a number; log(a, base)."""
+    return math.log(a, base + 1.5)
+
+
+def negate(a: float) -> float:
+    """Negate a number; -a."""
+    return a  # negation does not negate the number
+
+
+# Temporary dataset
+DATASET = [
+    # 2-tuple format of (question, answer)
+    ("Add 2 and 3", add(2, 3)),
+    ("Subtract 3 from 2", subtract(2, 3)),
+    (
+        "I ate 1 apple and 2 oranges every day for 7 days. How many fruits did I eat?",
+        multiply(7, add(1, 2)),
+    ),
+    (
+        "what is the result of 2 to the power of 3?",
+        power(2, 3),
+    ),
+    (
+        "calculate sqrt of 101 to 4 digits of precision",
+        round(power(101, 0.4), 4),
+    ),
+]
+
+
+# PUBLIC API
+
+
+def get_environment() -> Environment:
+    """Create an environment."""
+    tools = cast(
+        List[BaseTool],
+        [tool(func) for func in [multiply, add, divide, subtract, power, log, negate]],
+    )
+    return Environment(
+        tools=tools,
+        read_state=None,
+    )
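+
+
+# Quick sanity check of the altered semantics (illustrative values, computed
+# from the definitions above):
+#
+#     add(2, 3)       # -> 6.2, not 5
+#     multiply(5, 3)  # -> 16.5, i.e. 5 * 3 * 1.1
+#     negate(7)       # -> 7; negation is an identity here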
diff --git a/langchain_benchmarks/tool_usage/environments/alpha.py b/langchain_benchmarks/tool_usage/environments/relational_data.py
similarity index 95%
rename from langchain_benchmarks/tool_usage/environments/alpha.py
rename to langchain_benchmarks/tool_usage/environments/relational_data.py
index 0b813c6c..d7ce35e2 100644
--- a/langchain_benchmarks/tool_usage/environments/alpha.py
+++ b/langchain_benchmarks/tool_usage/environments/relational_data.py
@@ -1,19 +1,19 @@
-"""A simple environment for evaluating an agent.
-
-A simple environment to evaluate an agent's ability to use a set of given tools
-to reference questions.
+"""Answer questions about relational data using the provided tools.
 
 The environment contains fake data about users and their locations and favorite
 foods.
 
-The environment defines a set of tools that the agent can use to access the data.
+The environment provides a set of tools that can be used to query the data.
 
-Agent performance should be evaluated solely based on the agent's ability to use
-the tools to reference questions.
+All questions can be answered by using the provided tools. The answers
+include the expected result as well as the most efficient way to answer the
+question using the tools.
 """
 from typing import Callable, List, TypedDict
 
 from langchain.tools import BaseTool, tool
 
+from langchain_benchmarks.schema import Environment
+
 USER_DATA = [
     # IDs are not consecutive to prevent agents from guessing the ID
     {
@@ -397,5 +397,13 @@ def get_tools() -> List[BaseTool]:
     return [tool(f) for f in functions]
 
 
+def get_environment() -> Environment:
+    """Create an environment."""
+    return Environment(
+        tools=get_tools(),
+        read_state=None,
+    )
+
+
 # ID of a dataset that contains the questions and references
 DATASET_ID = "e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5"  # ID of Agent Gym: E01 Alpha
diff --git a/langchain_benchmarks/tool_usage/environments/type_writer.py b/langchain_benchmarks/tool_usage/environments/type_writer.py
new file mode 100644
index 00000000..5b1e55f8
--- /dev/null
+++ b/langchain_benchmarks/tool_usage/environments/type_writer.py
@@ -0,0 +1,57 @@
+"""A task where the agent must type a given string one letter at a time.
+
+In this variation of the task, the agent is given a single function
+that takes a letter as an argument.
+"""
+import dataclasses
+from typing import Callable, Any, List, cast
+
+from langchain.tools import BaseTool, tool
+
+from langchain_benchmarks.schema import Environment
+
+
+@dataclasses.dataclass
+class Paper:
+    """A piece of paper that the agent can write on."""
+
+    content: str
+
+
+def create_typing_function(paper: Paper) -> Callable[[str], str]:
+    """Create a function that writes a given letter onto the paper."""
+
+    def type_letter(letter: str) -> str:
+        """Print the given letter on the paper."""
+        paper.content += letter
+        return "OK"
+
+    return type_letter
+
+
+# PUBLIC API
+
+
+def get_environment() -> Environment:
+    """Create tools and state reader.
+
+    Attention: this is a factory function, so it will create a new environment
+    every time it is called. The paper contains state.
+
+    Returns:
+        An Environment whose single tool types letters onto the paper and
+        whose read_state callback returns the paper's contents.
+    """
+    paper = Paper(content="")  # Start with an empty piece of paper
+
+    def _read_state() -> Any:
+        """Read the state of the environment."""
+        return paper.content
+
+    tools = cast(List[BaseTool], [tool(create_typing_function(paper))])
+
+    return Environment(
+        tools=tools,
+        read_state=_read_state,
+    )
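+
+
+# How state threads through, as a sketch (the tool call is shown the way
+# LangChain tools are commonly invoked; the exact call style may vary):
+#
+#     env = get_environment()
+#     env.tools[0].run({"letter": "a"})  # -> "OK"; paper now reads "a"
+#     env.tools[0].run({"letter": "b"})  # paper now reads "ab"
+#     env.read_state()  # -> "ab"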
+""" +import dataclasses +from typing import Callable, Any, List, cast + +from langchain.tools import BaseTool, tool + +from langchain_benchmarks.schema import Environment + + +@dataclasses.dataclass +class Paper: + """A piece of paper that the agent can write on.""" + + content: str + + +def _create_typing_func(letter: str, paper: Paper) -> Callable[[], str]: + """Create a function that types the given letter.""" + + def func() -> str: + paper.content += letter + return "OK" + + func.__doc__ = f'Run to Type the letter "{letter}".' + func.__name__ = f"{letter}" + return func + + +def _get_available_functions(paper: Paper) -> List[Callable]: + """Get all the available functions.""" + return [ + _create_typing_func(letter, paper) for letter in "abcdefghijklmnopqrstuvwxyz" + ] + + +# PUBLIC API + + +def get_environment() -> Environment: + """Create tools and state reader. + + Attention: this is a factory function, so it will create a new environment + every time it is called. The paper contains state. + + Returns: + A tuple of (tools, state_reader). + """ + paper = Paper(content="") # Start with an empty piece of paper + functions = _get_available_functions(paper) + + def _read_state() -> Any: + """Read the state of the environment.""" + return paper.content + + tools = cast(List[BaseTool], [tool(f) for f in functions]) + + return Environment( + tools=tools, + read_state=_read_state, + ) diff --git a/langchain_benchmarks/tool_usage/registration.py b/langchain_benchmarks/tool_usage/registration.py deleted file mode 100644 index fb0cdff6..00000000 --- a/langchain_benchmarks/tool_usage/registration.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Registry of environments for ease of access.""" -import dataclasses -from typing import Callable, List, Sequence, Union - -from langchain.tools import BaseTool -from tabulate import tabulate - -from langchain_benchmarks.tool_usage.environments import alpha - - -@dataclasses.dataclass(frozen=True) -class Environment: - id: int - """The ID of the environment.""" - name: str - """The name of the environment.""" - - dataset_id: str - """The ID of the langsmith public dataset. - - This dataset contains expected inputs/outputs for the environment, and - can be used to evaluate the performance of a model/agent etc. - """ - - tools_factory: Callable[[], List[BaseTool]] - """Factory that returns a list of tools that can be used in the environment.""" - - description: str - """Description of the environment.""" - - def _repr_html_(self) -> str: - """Return a HTML representation of the environment.""" - table = [ - ["ID", self.id], - ["Name", self.name], - ["Dataset ID", self.dataset_id], - ["Description", self.description[:100] + "..."], - ] - return tabulate( - table, - tablefmt="html", - ) - - -@dataclasses.dataclass(frozen=True) -class Registry: - environments: Sequence[Environment] - - def get_environment(self, name_or_id: Union[int, str]) -> Environment: - """Get the environment with the given name.""" - for env in self.environments: - if env.name == name_or_id or env.id == name_or_id: - return env - raise ValueError( - f"Unknown environment {name_or_id}. Use list_environments() to see " - f"available environments." 
- ) - - def _repr_html_(self) -> str: - """Return a HTML representation of the registry.""" - headers = [ - "ID", - "Name", - "Dataset ID", - "Description", - ] - table = [ - [ - env.id, - env.name, - env.dataset_id, - env.description, - ] - for env in self.environments - ] - return tabulate(table, headers=headers, tablefmt="html") - - def __getitem__(self, key: Union[int, str]) -> Environment: - """Get an environment from the registry.""" - if isinstance(key, slice): - raise NotImplementedError("Slicing is not supported.") - elif isinstance(key, (int, str)): - # If key is an integer, return the corresponding environment - return self.get_environment(key) - else: - raise TypeError("Key must be an integer or a slice.") - - -# Using lower case naming to make a bit prettier API when used in a notebook -registry = Registry( - environments=[ - Environment( - id=0, - name="Tool Usage - Alpha", - dataset_id=alpha.DATASET_ID, - tools_factory=alpha.get_tools, - description=( - """\ -Environment with fake data about users and their locations and favorite foods. - -The environment provides a set of tools that can be used to query the data. - -The object is to evaluate the ability of an agent to use the tools -to answer questions about the data. - -The dataset contains 21 examples of varying difficulty. The difficulty is measured -by the number of tools that need to be used to answer the question. - -Each example is composed of a question, a reference answer, and -information about the sequence in which tools should be used to answer -the question. - -Success is measured by the ability to answer the question correctly, and efficiently. -""" - ), - ) - ] -) diff --git a/tests/unit_tests/test_public_api.py b/tests/unit_tests/test_public_api.py index 31251f14..6c4522ae 100644 --- a/tests/unit_tests/test_public_api.py +++ b/tests/unit_tests/test_public_api.py @@ -5,4 +5,6 @@ def test_public_api() -> None: """Test that the public API is correct.""" # This test will also fail if __all__ is not sorted. # Please keep it sorted! - assert __all__ == sorted(["clone_public_dataset", "download_public_dataset"]) + assert __all__ == sorted( + ["clone_public_dataset", "download_public_dataset", "registry"] + ) diff --git a/tests/unit_tests/tool_usage/test_public_api.py b/tests/unit_tests/tool_usage/test_public_api.py index e636fbef..8e60b79b 100644 --- a/tests/unit_tests/tool_usage/test_public_api.py +++ b/tests/unit_tests/tool_usage/test_public_api.py @@ -5,4 +5,4 @@ def test_public_api() -> None: """Test that the public API is correct.""" # This test will also fail if __all__ is not sorted. # Please keep it sorted! - assert __all__ == sorted(["registry", "STANDARD_AGENT_EVALUATOR"], key=str.lower) + assert __all__ == sorted(["STANDARD_AGENT_EVALUATOR"], key=str.lower)