-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add additional tasks, re-org repo a bit (#26)
* Push registry to top level * Rename environments to tasks * Tool usage tasks can create an environment; an environment can be associated with a state that can be read * Add additional tasks
- Loading branch information
Showing
13 changed files
with
599 additions
and
196 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
from langchain_benchmarks.registration import registry | ||
from langchain_benchmarks.utils._langsmith import ( | ||
clone_public_dataset, | ||
download_public_dataset, | ||
) | ||
|
||
# Please keep this list sorted! | ||
__all__ = ["clone_public_dataset", "download_public_dataset"] | ||
__all__ = ["clone_public_dataset", "download_public_dataset", "registry"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
"""Registry of environments for ease of access.""" | ||
import dataclasses | ||
from typing import Sequence, Union | ||
|
||
from tabulate import tabulate | ||
|
||
from langchain_benchmarks.schema import Task | ||
from langchain_benchmarks.tool_usage.environments import ( | ||
relational_data, | ||
type_writer, | ||
type_writer_26_funcs, | ||
multiverse_math, | ||
) | ||
|
||
|
||
@dataclasses.dataclass(frozen=True)
class Registry:
    """An immutable collection of tasks that can be looked up by ID or name.

    Task names and IDs are validated for uniqueness when the registry is
    constructed.

    Attributes:
        tasks: The tasks held by this registry.
    """

    tasks: Sequence[Task]

    def get_task(self, name_or_id: Union[int, str]) -> Task:
        """Get the task with the given name or ID.

        Args:
            name_or_id: The task name or the task ID.

        Returns:
            The first task whose name or ID equals ``name_or_id``.

        Raises:
            ValueError: If no task matches.
        """
        for task in self.tasks:
            if task.name == name_or_id or task.id == name_or_id:
                return task
        raise ValueError(f"Unknown task {name_or_id}")

    def __post_init__(self) -> None:
        """Validate that all the tasks have unique names and IDs.

        Raises:
            ValueError: If two tasks share a name or share an ID.
        """
        seen_names = set()
        seen_ids = set()
        for task in self.tasks:
            if task.name in seen_names:
                raise ValueError(
                    f"Duplicate task name {task.name}. " f"Task names must be unique."
                )
            seen_names.add(task.name)
            if task.id in seen_ids:
                raise ValueError(
                    f"Duplicate task ID {task.id}. " f"Task IDs must be unique."
                )
            # Record the ID; without this the duplicate-ID check can never fire.
            seen_ids.add(task.id)

    def _repr_html_(self) -> str:
        """Return an HTML table of the registry (one row per task)."""
        headers = [
            "ID",
            "Name",
            "Dataset ID",
            "Description",
        ]
        table = [
            [
                task.id,
                task.name,
                task.dataset_id,
                task.description,
            ]
            for task in self.tasks
        ]
        return tabulate(table, headers=headers, tablefmt="html")

    def __getitem__(self, key: Union[int, str]) -> Task:
        """Get a task from the registry by ID (int) or name (str).

        Raises:
            NotImplementedError: If ``key`` is a slice.
            TypeError: If ``key`` is neither an int nor a str.
            ValueError: If no task matches ``key``.
        """
        if isinstance(key, slice):
            raise NotImplementedError("Slicing is not supported.")
        elif isinstance(key, (int, str)):
            # Integers are matched against task IDs, strings against task names.
            return self.get_task(key)
        else:
            raise TypeError("Key must be an integer or a string.")
|
||
|
||
# Using lower case naming to make a bit prettier API when used in a notebook
# The module-level registry of the available tasks. `instructions` are the
# prompts handed to the agent/chain/llm, so their text is kept verbatim;
# `description` is documentation for practitioners.
registry = Registry(
    tasks=[
        Task(
            id=0,
            name="Tool Usage - Relational Data",
            dataset_id=relational_data.DATASET_ID,
            create_environment=relational_data.get_environment,
            instructions=(
                """\
Please answer the user's question by using the tools provided. Do not guess the \
answer. Keep in mind that entities like users,foods and locations have both a \
name and an ID, which are not the same."""
            ),
            description=(
                """\
Environment with fake data about users and their locations and favorite foods.
The environment provides a set of tools that can be used to query the data.
The objective of this task is to evaluate the ability to use the provided tools \
to answer questions about relational data.
The dataset contains 21 examples of varying difficulty. The difficulty is measured \
by the number of tools that need to be used to answer the question.
Each example is composed of a question, a reference answer, and \
information about the sequence in which tools should be used to answer \
the question.
Success is measured by the ability to answer the question correctly, and efficiently.
"""
            ),
        ),
        Task(
            id=1,
            name="Tool Usage - Typewriter (1 func)",
            dataset_id="placeholder",
            create_environment=type_writer.get_environment,
            instructions=(
                "Repeat the given string by using the provided tools. "
                "Do not write anything else or provide any explanations. "
                "For example, if the string is 'abc', you must invoke the tools "
                "'a', 'b', and 'c' in that order. "
                "Please invoke the function with a single letter at a time."
            ),
            description=(
                """\
Environment with a single function that accepts a single letter as input, and \
"prints" it on a piece of paper.
The objective of this task is to evaluate the ability to use the provided \
tools to repeat a given input string.
For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.
The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
            ),
        ),
        Task(
            id=2,
            name="Tool Usage - Typewriter",
            dataset_id="placeholder",
            create_environment=type_writer_26_funcs.get_environment,
            instructions=(
                "Repeat the given string by using the provided tools. "
                "Do not write anything else or provide any explanations. "
                "For example, if the string is 'abc', you must invoke the tools "
                "'a', 'b', and 'c' in that order. "
                "Please invoke the functions without any arguments."
            ),
            description=(
                """\
Environment with 26 functions each representing a letter of the alphabet.
In this variation of the typewriter task, there are 26 parameterless functions, where \
each function represents a letter of the alphabet (instead of a single function that \
takes a letter as an argument).
The objective is to evaluate the ability to use the functions to repeat the given string.
For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.
The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
            ),
        ),
        Task(
            id=3,
            name="Multiverse Math",
            dataset_id="placeholder",
            create_environment=multiverse_math.get_environment,
            instructions=(
                "You are requested to solve math questions in an alternate "
                "mathematical universe. The rules of association, commutativity, "
                "and distributivity still apply, but the operations have been "
                "altered to yield different results than expected. Solve the "
                "given math questions using the provided tools. "
                "Do not guess the answer."
            ),
            description=(
                """\
An environment that contains a few basic math operations, but with altered results.
For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.
The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
"""
            ),
        ),
    ]
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
"""Schema for the Langchain Benchmarks.""" | ||
import dataclasses | ||
from typing import List, Callable, Any, Optional | ||
|
||
from langchain.tools import BaseTool | ||
from tabulate import tabulate | ||
|
||
|
||
@dataclasses.dataclass(frozen=True)
class Environment:
    """An instance of an environment for tool usage.

    Attributes:
        tools: The tools that can be used in the environment.
        read_state: Optional zero-argument callable that returns the current
            state of the environment. Defaults to ``None``.
    """

    tools: List[BaseTool]
    read_state: Optional[Callable[[], Any]] = None
|
||
|
||
@dataclasses.dataclass(frozen=True)
class Task:
    """A definition for a task."""

    id: int
    """The ID of the task."""
    name: str
    """The name of the task."""

    dataset_id: str
    """The ID of the langsmith public dataset.

    This dataset contains expected inputs/outputs for the environment, and
    can be used to evaluate the performance of a model/agent etc.
    """

    create_environment: Callable[
        [], Environment
    ]  # Specialized for tool usage; refactor potentially
    """Factory that returns an environment."""

    description: str
    """Description of the task for a data science practitioner.

    This can contain information about the task, the dataset, the tools available
    etc.
    """

    instructions: str
    """Instructions for the agent/chain/llm."""

    def _short_description(self, max_length: int = 100) -> str:
        """Return the description, adding "..." only when it is truncated."""
        if len(self.description) <= max_length:
            return self.description
        return self.description[:max_length] + "..."

    def _repr_html_(self) -> str:
        """Return an HTML representation of the task.

        The description is shortened to at most 100 characters; an ellipsis
        is appended only when text was actually cut off.
        """
        table = [
            ["ID", self.id],
            ["Name", self.name],
            ["Dataset ID", self.dataset_id],
            ["Description", self._short_description()],
        ]
        return tabulate(
            table,
            tablefmt="html",
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,4 @@ | ||
# Testing Agents | ||
# Tool usage | ||
|
||
This directory contains environments that can be used to test an agent's ability | ||
to use tools and make decisions. | ||
|
||
## Environments | ||
|
||
Environments are named in the style of e[env_number]_[name].py. | ||
|
||
### e01_alpha | ||
|
||
* Consists of 3 relational tables of users, locations and foods. | ||
* Defines a set of tools that can be used to query these tables. | ||
* Agent should use the given tools to answer questions. | ||
|
||
## Running Evaluation | ||
|
||
Please refer to the following example to see how to set up and run evaluation | ||
for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb). | ||
This sub-package includes code to help test how well tools can be used to make | ||
decisions. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,5 @@ | ||
"""Package for helping to evaluate agent runs.""" | ||
from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR | ||
from langchain_benchmarks.tool_usage.registration import registry | ||
|
||
# Please keep this list sorted! | ||
__all__ = ["registry", "STANDARD_AGENT_EVALUATOR"] | ||
__all__ = ["STANDARD_AGENT_EVALUATOR"] |
88 changes: 88 additions & 0 deletions
88
langchain_benchmarks/tool_usage/environments/multiverse_math.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
"""Solve basic math questions using the provided tools. | ||
Must use the provided tools to solve the math question. | ||
To make sure that innate knowledge is not used, the math operations | ||
have been altered to yield different results than expected. | ||
The modified operations should yield different results, but still retain | ||
appropriate properties. For example, the modified multiplication operation | ||
should still be commutative. | ||
""" | ||
import math | ||
from typing import cast, List | ||
|
||
from langchain.tools import tool, BaseTool | ||
|
||
from langchain_benchmarks.schema import Environment | ||
|
||
|
||
def multiply(a: float, b: float) -> float:
    """Multiply two numbers; a * b."""
    # Altered on purpose: the ordinary product is scaled by 1.1.
    # NOTE(review): the docstring is presumably surfaced as the tool
    # description once wrapped by `tool` -- keep it verbatim; confirm.
    scaled_a = 1.1 * a
    return scaled_a * b
|
||
|
||
def divide(a: float, b: float) -> float:
    """Divide two numbers; a / b."""
    # Division is neither commutative nor associative, so the only alteration
    # is shifting the divisor by 0.5.
    shifted_divisor = b + 0.5
    return a / shifted_divisor
|
||
|
||
def add(a: float, b: float) -> float:
    """Add two numbers; a + b."""
    # Altered on purpose: the ordinary sum is offset by 1.2.
    ordinary_sum = a + b
    return ordinary_sum + 1.2
|
||
|
||
def subtract(a: float, b: float) -> float:
    """Subtract two numbers; a - b."""
    # Altered on purpose: the ordinary difference is lowered by 3.
    ordinary_difference = a - b
    return ordinary_difference - 3
|
||
|
||
def power(a: float, b: float) -> float:
    """Raise a number to a power; a ** b."""
    # Altered on purpose: the exponent is shifted up by 2.
    shifted_exponent = b + 2
    return a**shifted_exponent
|
||
|
||
def log(a: float, base: float) -> float:
    """Take the log of a number; log(a, base)."""
    # Altered on purpose: the logarithm is taken in base (base + 1.5).
    shifted_base = base + 1.5
    return math.log(a, shifted_base)
|
||
|
||
def negate(a: float) -> float:
    """Negate a number; -a."""
    # Deliberately returns the input unchanged: the operations in this module
    # are altered so that innate math knowledge cannot produce the answer.
    return a  # negation does not negate the number
|
||
|
||
# Temporary dataset
# Reference answers are computed with the altered operations defined in this
# module, not with ordinary arithmetic.
DATASET = [
    # 2-tuple format of (question, answer)
    ("Add 2 and 3", add(2, 3)),
    ("Subtract 3 from 2", subtract(2, 3)),
    (
        "I ate 1 apple and 2 oranges every day for 7 days. How many fruits did I eat?",
        multiply(7, add(1, 2)),
    ),
    (
        "what is the result of 2 to the power of 3?",
        power(2, 3),
    ),
    (
        "calculate sqrt of 101 to 4 digits of precision",
        # A square root is a power of 0.5 (the original used 0.4, which is
        # not a square root), so the reference matches power(101, 0.5).
        round(power(101, 0.5), 4),
    ),
]
|
||
|
||
# PUBLIC API | ||
|
||
|
||
def get_environment() -> Environment:
    """Build the environment exposing this module's altered math operations.

    Returns:
        An ``Environment`` whose tools wrap ``multiply``, ``add``, ``divide``,
        ``subtract``, ``power``, ``log`` and ``negate`` (in that order), with
        no state reader.
    """
    operations = [multiply, add, divide, subtract, power, log, negate]
    wrapped = [tool(operation) for operation in operations]
    # `cast` only informs the type checker; it does not convert anything.
    return Environment(tools=cast(List[BaseTool], wrapped), read_state=None)
Oops, something went wrong.