Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional tasks, re-org repo a bit #26

Merged
merged 2 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 112 additions & 48 deletions docs/source/notebooks/tool_usage.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion langchain_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from langchain_benchmarks.registration import registry
from langchain_benchmarks.utils._langsmith import (
clone_public_dataset,
download_public_dataset,
)

# Please keep this list sorted!
__all__ = ["clone_public_dataset", "download_public_dataset"]
__all__ = ["clone_public_dataset", "download_public_dataset", "registry"]
191 changes: 191 additions & 0 deletions langchain_benchmarks/registration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""Registry of environments for ease of access."""
import dataclasses
from typing import Sequence, Union

from tabulate import tabulate

from langchain_benchmarks.schema import Task
from langchain_benchmarks.tool_usage.environments import (
relational_data,
type_writer,
type_writer_26_funcs,
multiverse_math,
)


@dataclasses.dataclass(frozen=True)
class Registry:
    """An immutable collection of tasks that can be looked up by name or ID.

    Attributes:
        tasks: The registered tasks. Task names must be unique and task IDs
            must be unique; both are checked when the registry is created.
    """

    tasks: Sequence["Task"]

    def __post_init__(self) -> None:
        """Validate that all the tasks have unique names and IDs.

        Raises:
            ValueError: If two tasks share a name, or two tasks share an ID.
        """
        seen_names = set()
        seen_ids = set()
        for task in self.tasks:
            if task.name in seen_names:
                raise ValueError(
                    f"Duplicate task name {task.name}. " f"Task names must be unique."
                )
            seen_names.add(task.name)
            if task.id in seen_ids:
                raise ValueError(
                    f"Duplicate task ID {task.id}. " f"Task IDs must be unique."
                )
            # Remember the ID; without this the duplicate-ID check never fires.
            seen_ids.add(task.id)

    def get_task(self, name_or_id: Union[int, str]) -> "Task":
        """Get the task with the given name or ID.

        Args:
            name_or_id: The task's name or its ID.

        Returns:
            The first task whose ``name`` or ``id`` equals ``name_or_id``.

        Raises:
            ValueError: If no task matches.
        """
        for task in self.tasks:
            if task.name == name_or_id or task.id == name_or_id:
                return task
        raise ValueError(f"Unknown task {name_or_id}")

    def _repr_html_(self) -> str:
        """Return an HTML table with one row per task (for notebook display)."""
        headers = [
            "ID",
            "Name",
            "Dataset ID",
            "Description",
        ]
        table = [
            [
                task.id,
                task.name,
                task.dataset_id,
                task.description,
            ]
            for task in self.tasks
        ]
        return tabulate(table, headers=headers, tablefmt="html")

    def __getitem__(self, key: Union[int, str]) -> "Task":
        """Get a task from the registry by ID (int) or name (str).

        Raises:
            NotImplementedError: If ``key`` is a slice.
            TypeError: If ``key`` is neither an int nor a str.
            ValueError: If no task matches ``key``.
        """
        if isinstance(key, slice):
            raise NotImplementedError("Slicing is not supported.")
        if isinstance(key, (int, str)):
            # Integers are matched against task IDs, strings against names.
            return self.get_task(key)
        raise TypeError("Key must be an integer or a string.")


# Using lower case naming to make a bit prettier API when used in a notebook
# Module-level registry holding the built-in tasks. Task names and IDs are
# expected to be unique across this list.
registry = Registry(
    tasks=[
        # Task 0: answer questions over relational data using the tools from
        # the `relational_data` environment module.
        Task(
            id=0,
            name="Tool Usage - Relational Data",
            dataset_id=relational_data.DATASET_ID,
            create_environment=relational_data.get_environment,
            # The strings below are runtime text; the trailing backslashes join
            # source lines without inserting a newline.
            instructions=(
                """\
Please answer the user's question by using the tools provided. Do not guess the \
answer. Keep in mind that entities like users,foods and locations have both a \
name and an ID, which are not the same."""
            ),
            description=(
                """\
Environment with fake data about users and their locations and favorite foods.

The environment provides a set of tools that can be used to query the data.

The objective of this task is to evaluate the ability to use the provided tools \
to answer questions about relational data.

The dataset contains 21 examples of varying difficulty. The difficulty is measured \
by the number of tools that need to be used to answer the question.

Each example is composed of a question, a reference answer, and \
information about the sequence in which tools should be used to answer \
the question.

Success is measured by the ability to answer the question correctly, and efficiently.
"""
            ),
        ),
        # Task 1: typewriter variant with a single function that takes a letter.
        Task(
            id=1,
            name="Tool Usage - Typewriter (1 func)",
            # NOTE(review): "placeholder" looks like a stand-in until a real
            # dataset ID exists -- confirm before relying on it.
            dataset_id="placeholder",
            create_environment=type_writer.get_environment,
            instructions=(
                "Repeat the given string by using the provided tools. "
                "Do not write anything else or provide any explanations. "
                "For example, if the string is 'abc', you must invoke the tools "
                "'a', 'b', and 'c' in that order. "
                "Please invoke the function with a single letter at a time."
            ),
            description=(
                """\
Environment with a single function that accepts a single letter as input, and \
"prints" it on a piece of paper.

The objective of this task is to evaluate the ability to use the provided \
tools to repeat a given input string.

For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.

The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
            ),
        ),
        # Task 2: typewriter variant with 26 parameterless functions, one per
        # letter (see the description text below).
        Task(
            id=2,
            name="Tool Usage - Typewriter",
            # NOTE(review): "placeholder" looks like a stand-in until a real
            # dataset ID exists -- confirm before relying on it.
            dataset_id="placeholder",
            create_environment=type_writer_26_funcs.get_environment,
            instructions=(
                "Repeat the given string by using the provided tools. "
                "Do not write anything else or provide any explanations. "
                "For example, if the string is 'abc', you must invoke the tools "
                "'a', 'b', and 'c' in that order. "
                "Please invoke the functions without any arguments."
            ),
            description=(
                """\
Environment with 26 functions each representing a letter of the alphabet.

In this variation of the typewriter task, there are 26 parameterless functions, where \
each function represents a letter of the alphabet (instead of a single function that \
takes a letter as an argument).

The object is to evaluate the ability of use the functions to repeat the given string.

For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.

The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
            ),
        ),
        # Task 3: math questions answered with the altered operations from the
        # `multiverse_math` environment module.
        Task(
            id=3,
            name="Multiverse Math",
            # NOTE(review): "placeholder" looks like a stand-in until a real
            # dataset ID exists -- confirm before relying on it.
            dataset_id="placeholder",
            create_environment=multiverse_math.get_environment,
            instructions=(
                "You are requested to solve math questions in an alternate "
                "mathematical universe. The rules of association, commutativity, "
                "and distributivity still apply, but the operations have been "
                "altered to yield different results than expected. Solve the "
                "given math questions using the provided tools. "
                "Do not guess the answer."
            ),
            description=(
                """\
An environment that contains a few basic math operations, but with altered results.

For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.

The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
"""
            ),
        ),
    ]
)
62 changes: 62 additions & 0 deletions langchain_benchmarks/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Schema for the Langchain Benchmarks."""
import dataclasses
from typing import List, Callable, Any, Optional

from langchain.tools import BaseTool
from tabulate import tabulate


@dataclasses.dataclass(frozen=True)
class Environment:
    """An instance of an environment for tool usage."""

    #: The tools that can be used in the environment.
    tools: List[BaseTool]

    #: A function that returns the current state of the environment.
    #: Defaults to ``None`` when no such function is supplied.
    read_state: Optional[Callable[[], Any]] = None


@dataclasses.dataclass(frozen=True)
class Task:
    """A definition for a task."""

    id: int
    """The ID of the task."""
    name: str
    """The name of the task."""

    dataset_id: str
    """The ID of the langsmith public dataset.

    This dataset contains expected inputs/outputs for the environment, and
    can be used to evaluate the performance of a model/agent etc.
    """

    create_environment: Callable[
        [], Environment
    ]  # Specialized for tool usage; refactor potentially
    """Factory that returns an environment."""

    description: str
    """Description of the task for a data science practitioner.

    This can contain information about the task, the dataset, the tools available
    etc.
    """

    instructions: str
    """Instructions for the agent/chain/llm."""

    def _repr_html_(self) -> str:
        """Return an HTML representation of the task (for notebook display).

        The description is cut to its first 100 characters; an ellipsis is
        appended only when something was actually cut off.
        """
        max_chars = 100
        description = self.description
        if len(description) > max_chars:
            description = description[:max_chars] + "..."
        table = [
            ["ID", self.id],
            ["Name", self.name],
            ["Dataset ID", self.dataset_id],
            ["Description", description],
        ]
        return tabulate(
            table,
            tablefmt="html",
        )
21 changes: 3 additions & 18 deletions langchain_benchmarks/tool_usage/README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,4 @@
# Testing Agents
# Tool usage

This directory contains environments that can be used to test an agent's ability
to use tools and make decisions.

## Environments

Environments are named in the style of e[env_number]_[name].py.

### e01_alpha

* Consists of 3 relational tables of users, locations and foods.
* Defines a set of tools that can be used to query these tables.
* Agent should use the given tools to answer questions.

## Running Evaluation

Please refer to the following example to see how to set up and run evaluation
for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb).
This sub-package includes code to help test how well tools can be used to make
decisions.
3 changes: 1 addition & 2 deletions langchain_benchmarks/tool_usage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
from langchain_benchmarks.tool_usage.registration import registry

# Please keep this list sorted!
__all__ = ["registry", "STANDARD_AGENT_EVALUATOR"]
__all__ = ["STANDARD_AGENT_EVALUATOR"]
88 changes: 88 additions & 0 deletions langchain_benchmarks/tool_usage/environments/multiverse_math.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Solve basic math question using the provided tools.

Must use the provided tools to solve the math question.

To make sure that innate knowledge is not used, the math operations
have been altered to yield different results than expected.

The modified operations should yield different results, but still retain
appropriate properties. For example, the modified multiplication operation
should still be commutative.
"""
import math
from typing import cast, List

from langchain.tools import tool, BaseTool

from langchain_benchmarks.schema import Environment


def multiply(a: float, b: float) -> float:
    """Multiply two numbers; a * b."""
    # Altered rule: the ordinary product is scaled by 1.1.
    # NOTE(review): the docstring presumably doubles as the tool description
    # once wrapped by `tool(...)`, so it is left untouched -- confirm.
    scaled_a = 1.1 * a
    return scaled_a * b


def divide(a: float, b: float) -> float:
    """Divide two numbers; a / b."""
    # Division is neither commutative nor associative
    # Altered rule: the divisor is shifted by 0.5 before dividing.
    # NOTE(review): the docstring presumably doubles as the tool description
    # once wrapped by `tool(...)`, so it is left untouched -- confirm.
    shifted_divisor = b + 0.5
    return a / shifted_divisor


def add(a: float, b: float) -> float:
    """Add two numbers; a + b."""
    # Altered rule: 1.2 is added on top of the ordinary sum.
    # NOTE(review): the docstring presumably doubles as the tool description
    # once wrapped by `tool(...)`, so it is left untouched -- confirm.
    ordinary_sum = a + b
    return ordinary_sum + 1.2


def subtract(a: float, b: float) -> float:
    """Subtract two numbers; a - b."""
    # Altered rule: 3 more is taken off the ordinary difference.
    # NOTE(review): the docstring presumably doubles as the tool description
    # once wrapped by `tool(...)`, so it is left untouched -- confirm.
    ordinary_difference = a - b
    return ordinary_difference - 3


def power(a: float, b: float) -> float:
    """Raise a number to a power; a ** b."""
    # Altered rule: the exponent is increased by 2 before exponentiation.
    # NOTE(review): the docstring presumably doubles as the tool description
    # once wrapped by `tool(...)`, so it is left untouched -- confirm.
    shifted_exponent = b + 2
    return pow(a, shifted_exponent)


def log(a: float, base: float) -> float:
    """Take the log of a number; log(a, base)."""
    # Altered rule: the base is shifted by 1.5 before taking the logarithm.
    # NOTE(review): the docstring presumably doubles as the tool description
    # once wrapped by `tool(...)`, so it is left untouched -- confirm.
    shifted_base = base + 1.5
    return math.log(a, shifted_base)


def negate(a: float) -> float:
    """Negate a number; -a."""
    # Altered rule: negation does not negate the number; the input comes back
    # unchanged.
    # NOTE(review): the docstring presumably doubles as the tool description
    # once wrapped by `tool(...)`, so it is left untouched -- confirm.
    unchanged = a
    return unchanged


# Temporary dataset
# Reference answers are computed with the altered operations defined in this
# module, so they follow the module's rules rather than ordinary arithmetic.
DATASET = [
    # 2-tuple format of (question, answer)
    ("Add 2 and 3", add(2, 3)),
    ("Subtract 3 from 2", subtract(2, 3)),
    (
        "I ate 1 apple and 2 oranges every day for 7 days. How many fruits did I eat?",
        multiply(7, add(1, 2)),
    ),
    (
        "what is the result of 2 to the power of 3?",
        power(2, 3),
    ),
    (
        "calculate sqrt of 101 to 4 digits of precision",
        # A square root is the power 0.5; the reference previously used 0.4,
        # which does not correspond to the question being asked.
        round(power(101, 0.5), 4),
    ),
]


# PUBLIC API


def get_environment() -> Environment:
    """Build the environment exposing this module's math operations as tools.

    Returns:
        An ``Environment`` whose tools wrap, in order, ``multiply``, ``add``,
        ``divide``, ``subtract``, ``power``, ``log`` and ``negate``, and whose
        ``read_state`` is ``None``.
    """
    operations = (multiply, add, divide, subtract, power, log, negate)
    # `cast` only informs the type checker; it does not convert anything.
    wrapped = cast(List[BaseTool], list(map(tool, operations)))
    return Environment(tools=wrapped, read_state=None)
Loading