Skip to content

Commit

Permalink
Add additional tasks, re-org repo a bit (#26)
Browse files Browse the repository at this point in the history
* Push registry to top level
* Rename environments to tasks
* Tool usage tasks can create an environment; an environment can be associated with a state that can be read
* Add additional tasks
  • Loading branch information
eyurtsev authored Nov 20, 2023
1 parent 65aeb98 commit 5f2ce54
Show file tree
Hide file tree
Showing 13 changed files with 599 additions and 196 deletions.
160 changes: 112 additions & 48 deletions docs/source/notebooks/tool_usage.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion langchain_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from langchain_benchmarks.registration import registry
from langchain_benchmarks.utils._langsmith import (
clone_public_dataset,
download_public_dataset,
)

# Please keep this list sorted!
# NOTE: the scraped diff left both the pre- and post-commit `__all__` lines in
# place; only the final assignment (including "registry") is kept, since the
# earlier one was dead code immediately overwritten.
__all__ = ["clone_public_dataset", "download_public_dataset", "registry"]
191 changes: 191 additions & 0 deletions langchain_benchmarks/registration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""Registry of environments for ease of access."""
import dataclasses
from typing import Sequence, Union

from tabulate import tabulate

from langchain_benchmarks.schema import Task
from langchain_benchmarks.tool_usage.environments import (
relational_data,
type_writer,
type_writer_26_funcs,
multiverse_math,
)


@dataclasses.dataclass(frozen=True)
class Registry:
    """Immutable registry of benchmark tasks, addressable by name or integer ID."""

    # All registered tasks; names and IDs are validated to be unique on init.
    tasks: Sequence[Task]

    def get_task(self, name_or_id: Union[int, str]) -> Task:
        """Return the task whose name (str) or ID (int) matches ``name_or_id``.

        Raises:
            ValueError: If no registered task matches.
        """
        for task in self.tasks:
            if task.name == name_or_id or task.id == name_or_id:
                return task
        raise ValueError(f"Unknown task {name_or_id}")

    def __post_init__(self) -> None:
        """Validate that all the tasks have unique names and IDs."""
        seen_names = set()
        seen_ids = set()
        for task in self.tasks:
            if task.name in seen_names:
                raise ValueError(
                    f"Duplicate task name {task.name}. " f"Task names must be unique."
                )
            seen_names.add(task.name)
            if task.id in seen_ids:
                raise ValueError(
                    f"Duplicate task ID {task.id}. " f"Task IDs must be unique."
                )
            # Bug fix: IDs were never recorded in `seen_ids`, so duplicate task
            # IDs previously passed validation silently.
            seen_ids.add(task.id)

    def _repr_html_(self) -> str:
        """Return an HTML table of the registry (rendered by Jupyter notebooks)."""
        headers = [
            "ID",
            "Name",
            "Dataset ID",
            "Description",
        ]
        table = [
            [task.id, task.name, task.dataset_id, task.description]
            for task in self.tasks
        ]
        return tabulate(table, headers=headers, tablefmt="html")

    def __getitem__(self, key: Union[int, str]) -> Task:
        """Look up a task by integer ID or string name (``registry[0]``, ``registry["..."]``)."""
        if isinstance(key, slice):
            raise NotImplementedError("Slicing is not supported.")
        elif isinstance(key, (int, str)):
            # Delegate to get_task, which matches on either name or ID.
            return self.get_task(key)
        else:
            # Bug fix: the message used to claim "integer or a slice", but this
            # branch is reached when the key is neither an int nor a str.
            raise TypeError("Key must be an integer or a string.")


# Using lower case naming to make a bit prettier API when used in a notebook
registry = Registry(
    tasks=[
        # Task 0: answer questions about fake relational data via query tools.
        Task(
            id=0,
            name="Tool Usage - Relational Data",
            dataset_id=relational_data.DATASET_ID,
            create_environment=relational_data.get_environment,
            # NOTE(review): "users,foods" is missing a space after the comma;
            # left as-is because this string is part of the agent prompt and
            # changing it could affect benchmark reproducibility.
            instructions=(
                """\
Please answer the user's question by using the tools provided. Do not guess the \
answer. Keep in mind that entities like users,foods and locations have both a \
name and an ID, which are not the same."""
            ),
            description=(
                """\
Environment with fake data about users and their locations and favorite foods.
The environment provides a set of tools that can be used to query the data.
The objective of this task is to evaluate the ability to use the provided tools \
to answer questions about relational data.
The dataset contains 21 examples of varying difficulty. The difficulty is measured \
by the number of tools that need to be used to answer the question.
Each example is composed of a question, a reference answer, and \
information about the sequence in which tools should be used to answer \
the question.
Success is measured by the ability to answer the question correctly, and efficiently.
"""
            ),
        ),
        # Task 1: typewriter with a single tool taking one letter per call.
        # NOTE(review): dataset_id is still "placeholder" — no public dataset yet.
        Task(
            id=1,
            name="Tool Usage - Typewriter (1 func)",
            dataset_id="placeholder",
            create_environment=type_writer.get_environment,
            instructions=(
                "Repeat the given string by using the provided tools. "
                "Do not write anything else or provide any explanations. "
                "For example, if the string is 'abc', you must invoke the tools "
                "'a', 'b', and 'c' in that order. "
                "Please invoke the function with a single letter at a time."
            ),
            description=(
                """\
Environment with a single function that accepts a single letter as input, and \
"prints" it on a piece of paper.
The objective of this task is to evaluate the ability to use the provided \
tools to repeat a given input string.
For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.
The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
            ),
        ),
        # Task 2: typewriter variant with 26 parameterless tools, one per letter.
        Task(
            id=2,
            name="Tool Usage - Typewriter",
            dataset_id="placeholder",
            create_environment=type_writer_26_funcs.get_environment,
            instructions=(
                "Repeat the given string by using the provided tools. "
                "Do not write anything else or provide any explanations. "
                "For example, if the string is 'abc', you must invoke the tools "
                "'a', 'b', and 'c' in that order. "
                "Please invoke the functions without any arguments."
            ),
            # NOTE(review): "The object is to evaluate the ability of use" reads
            # awkwardly (likely "The objective is to evaluate the ability to
            # use"); left untouched because it is a runtime string.
            description=(
                """\
Environment with 26 functions each representing a letter of the alphabet.
In this variation of the typewriter task, there are 26 parameterless functions, where \
each function represents a letter of the alphabet (instead of a single function that \
takes a letter as an argument).
The object is to evaluate the ability of use the functions to repeat the given string.
For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.
The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
            ),
        ),
        # Task 3: math with deliberately altered operations (see the
        # multiverse_math environment module for the altered definitions).
        Task(
            id=3,
            name="Multiverse Math",
            dataset_id="placeholder",
            create_environment=multiverse_math.get_environment,
            instructions=(
                "You are requested to solve math questions in an alternate "
                "mathematical universe. The rules of association, commutativity, "
                "and distributivity still apply, but the operations have been "
                "altered to yield different results than expected. Solve the "
                "given math questions using the provided tools. "
                "Do not guess the answer."
            ),
            description=(
                """\
An environment that contains a few basic math operations, but with altered results.
For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.
The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
"""
            ),
        ),
    ]
)
62 changes: 62 additions & 0 deletions langchain_benchmarks/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Schema for the Langchain Benchmarks."""
import dataclasses
from typing import List, Callable, Any, Optional

from langchain.tools import BaseTool
from tabulate import tabulate


# Frozen dataclass: instances are immutable once created.
@dataclasses.dataclass(frozen=True)
class Environment:
    """An instance of an environment for tool usage."""

    tools: List[BaseTool]
    """The tools that can be used in the environment."""

    # Optional because stateless environments have no state to expose.
    read_state: Optional[Callable[[], Any]] = None
    """A function that returns the current state of the environment."""


@dataclasses.dataclass(frozen=True)
class Task:
    """A definition for a benchmark task.

    Bundles the task metadata (ID, name), the LangSmith dataset with reference
    inputs/outputs, the factory for the tool-usage environment, and the
    instructions handed to the agent/chain/LLM.
    """

    id: int
    """The unique ID of the task."""
    name: str
    """The unique name of the task."""

    dataset_id: str
    """The ID of the langsmith public dataset.

    This dataset contains expected inputs/outputs for the environment, and
    can be used to evaluate the performance of a model/agent etc.
    """

    create_environment: Callable[
        [], Environment
    ]  # Specialized for tool usage; refactor potentially
    """Factory that returns an environment."""

    description: str
    """Description of the task for a data science practitioner.

    This can contain information about the task, the dataset, the tools available
    etc.
    """

    instructions: str
    """Instructions for the agent/chain/llm."""

    def _repr_html_(self) -> str:
        """Return an HTML table for the task (rendered by Jupyter notebooks)."""
        # Bug fix: "..." used to be appended unconditionally, so descriptions
        # of 100 characters or fewer were shown with a spurious ellipsis.
        description = self.description
        if len(description) > 100:
            description = description[:100] + "..."
        table = [
            ["ID", self.id],
            ["Name", self.name],
            ["Dataset ID", self.dataset_id],
            ["Description", description],
        ]
        return tabulate(
            table,
            tablefmt="html",
        )
21 changes: 3 additions & 18 deletions langchain_benchmarks/tool_usage/README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,4 @@
# Testing Agents
# Tool usage

This directory contains environments that can be used to test an agent's ability
to use tools and make decisions.

## Environments

Environments are named in the style of e[env_number]_[name].py.

### e01_alpha

* Consists of 3 relational tables of users, locations and foods.
* Defines a set of tools that can be used with these tables.
* Agent should use the given tools to answer questions.

## Running Evaluation

Please refer to the following example to see how to set up and run evaluation
for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb).
This sub-package includes code to help test how well tools can be used to make
decisions.
3 changes: 1 addition & 2 deletions langchain_benchmarks/tool_usage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
from langchain_benchmarks.tool_usage.registration import registry

# Please keep this list sorted!
# NOTE: the scraped diff left both the pre- and post-commit `__all__` lines in
# place; only the post-commit assignment is kept ("registry" moved to the
# package top level in this commit), since the earlier one was dead code.
__all__ = ["STANDARD_AGENT_EVALUATOR"]
88 changes: 88 additions & 0 deletions langchain_benchmarks/tool_usage/environments/multiverse_math.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Solve basic math question using the provided tools.
Must use the provided tools to solve the math question.
To make sure that innate knowledge is not used, the math operations
have been altered to yield different results than expected.
The modified operations should yield different results, but still retain
appropriate properties. For example, the modified multiplication operation
should still be commutative.
"""
import math
from typing import cast, List

from langchain.tools import tool, BaseTool

from langchain_benchmarks.schema import Environment


def multiply(a: float, b: float) -> float:
    """Multiply two numbers; a * b."""
    # Scaled by 1.1 so agents cannot rely on memorized arithmetic.
    scaled_product = 1.1 * a * b
    return scaled_product


def divide(a: float, b: float) -> float:
    """Divide two numbers; a / b."""
    # Division is neither commutative nor associative; the denominator is
    # shifted by 0.5 so results differ from ordinary division.
    shifted_denominator = b + 0.5
    return a / shifted_denominator


def add(a: float, b: float) -> float:
    """Add two numbers; a + b."""
    # Offset by 1.2 so the result differs from ordinary addition.
    offset_sum = a + b + 1.2
    return offset_sum


def subtract(a: float, b: float) -> float:
    """Subtract two numbers; a - b."""
    # Shifted by an extra -3 relative to true subtraction.
    shifted_difference = a - b - 3
    return shifted_difference


def power(a: float, b: float) -> float:
    """Raise a number to a power; a ** b."""
    # The exponent is inflated by 2 relative to the requested power.
    inflated_exponent = b + 2
    return a**inflated_exponent


def log(a: float, base: float) -> float:
    """Take the log of a number; log(a, base)."""
    # The base is shifted by 1.5 before the logarithm is taken.
    shifted_base = base + 1.5
    return math.log(a, shifted_base)


def negate(a: float) -> float:
    """Negate a number; -a."""
    # Deliberately the identity: in this universe negation is a no-op.
    return a


# Temporary dataset
# Reference answers are computed with the altered operations defined above,
# so they automatically stay in sync with the environment's math.
DATASET = [
    # 2-tuple format of (question, answer)
    ("Add 2 and 3", add(2, 3)),
    ("Subtract 3 from 2", subtract(2, 3)),
    (
        "I ate 1 apple and 2 oranges every day for 7 days. How many fruits did I eat?",
        multiply(7, add(1, 2)),
    ),
    (
        "what is the result of 2 to the power of 3?",
        power(2, 3),
    ),
    (
        # NOTE(review): the reference uses power(101, 0.4), i.e. 101 ** 2.4
        # under the altered exponent — confirm this is the intended reference
        # for a "sqrt" question in this universe.
        "calculate sqrt of 101 to 4 digits of precision",
        round(power(101, 0.4), 4),
    ),
]


# PUBLIC API


def get_environment() -> Environment:
    """Create an environment."""
    # Wrap each altered math function as a LangChain tool; the cast only
    # narrows the static type for type checkers.
    functions = [multiply, add, divide, subtract, power, log, negate]
    wrapped_tools = cast(List[BaseTool], [tool(fn) for fn in functions])
    return Environment(tools=wrapped_tools, read_state=None)
Loading

0 comments on commit 5f2ce54

Please sign in to comment.