Refactor for new registry #32

Closed · wants to merge 6 commits
160 changes: 112 additions & 48 deletions docs/source/notebooks/tool_usage.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion langchain_benchmarks/__init__.py
@@ -1,7 +1,8 @@
from langchain_benchmarks.registration import registry
from langchain_benchmarks.utils._langsmith import (
    clone_public_dataset,
    download_public_dataset,
)

# Please keep this list sorted!
__all__ = ["clone_public_dataset", "download_public_dataset"]
__all__ = ["clone_public_dataset", "download_public_dataset", "registry"]
74 changes: 74 additions & 0 deletions langchain_benchmarks/extraction/email_task.py
@@ -0,0 +1,74 @@
from enum import Enum
from typing import Optional, List

from langchain.schema.language_model import BaseLanguageModel
from langchain.smith import RunEvalConfig
from pydantic import BaseModel, Field

from langchain_benchmarks.schema import ExtractionTask


class ToneEnum(str, Enum):
"""The tone of the email."""

positive = "positive"
negative = "negative"


class Email(BaseModel):
"""Relevant information about an email."""

sender: Optional[str] = Field(None, description="The sender's name, if available")
sender_phone_number: Optional[str] = Field(
None, description="The sender's phone number, if available"
)
sender_address: Optional[str] = Field(
None, description="The sender's address, if available"
)
action_items: List[str] = Field(
..., description="A list of action items requested by the email"
)
topic: str = Field(
..., description="High level description of what the email is about"
)
tone: ToneEnum = Field(..., description="The tone of the email.")


def get_eval_config(eval_llm: BaseLanguageModel) -> RunEvalConfig:
    """Get the evaluation configuration for the email task."""
    return RunEvalConfig(
        evaluators=[
            RunEvalConfig.LabeledScoreString(
                criteria={
                    "accuracy": """
Score 1: The answer is incorrect and unrelated to the question or reference document.
Score 3: The answer is partially correct but has more than one omission or major error.
Score 5: The answer is mostly correct but has more than one omission or major error.
Score 7: The answer is mostly correct but has at most one omission or major error.
Score 9: The answer is mostly correct with no omissions and only minor errors, and aligns with the reference document.
Score 10: The answer is correct, complete, and aligns with the reference document. Extra information is acceptable if it is sensible.

If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct.
If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct and not be penalized.
"""  # noqa
                },
                llm=eval_llm,
                normalize_by=10.0,
            ),
        ],
    )


EMAIL_EXTRACTION_TASK = ExtractionTask(
name="Email Extraction",
dataset_id="https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d",
model=Email,
description="""\
A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, \
as well as a script for initial extraction and formatting of other emails from \
an arbitrary .mbox file like the one exported by Gmail.

Some additional cleanup of the data was done by hand after the initial pass.

See https://github.com/jacoblee93/oss-model-extraction-evals.
""",
)
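As a quick illustration of the extraction target, the sketch below instantiates the `Email` schema directly. The field values are invented, and only standard pydantic (v1-style) behaviour is assumed.

```python
# Hypothetical record conforming to the Email schema above (values are made up).
example = Email(
    sender="Jane Doe",
    action_items=["Reply with availability for a call next week"],
    topic="Scheduling a product demo",
    tone=ToneEnum.positive,
)
print(example.json(indent=2))       # pydantic v1-style serialization
print(Email.schema()["required"])   # fields an extractor must always produce
```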
6 changes: 4 additions & 2 deletions langchain_benchmarks/rag/__init__.py
@@ -1,6 +1,8 @@
"""RAG environments."""
from langchain_benchmarks.rag.evaluators import RAG_EVALUATION
from langchain_benchmarks.rag.registration import registry
from langchain_benchmarks.rag.environments.langchain_docs.task import (
    LANGCHAIN_DOCS_TASK,
)

# Please keep this list sorted!
__all__ = ["registry", "RAG_EVALUATION"]
__all__ = ["LANGCHAIN_DOCS_TASK", "RAG_EVALUATION"]
@@ -1 +0,0 @@
DATASET_ID = "452ccafc-18e1-4314-885b-edd735f17b9d" # ID of public LangChain Docs dataset
@@ -2,4 +2,4 @@
    ARCH_FACTORIES,
)

__all__ = ["ARCH_FACTORIES"]
__all__ = ["ARCH_FACTORIES"]
28 changes: 28 additions & 0 deletions langchain_benchmarks/rag/environments/langchain_docs/task.py
@@ -0,0 +1,28 @@
from langchain_benchmarks.rag.environments.langchain_docs import (
    langchain_docs_retriever,
    architectures,
)
from langchain_benchmarks.schema import RetrievalTask

DATASET_ID = (
    "452ccafc-18e1-4314-885b-edd735f17b9d"  # ID of public LangChain Docs dataset
)

LANGCHAIN_DOCS_TASK = RetrievalTask(
    name="LangChain Docs Q&A",
    dataset_id=DATASET_ID,
    retriever_factories=langchain_docs_retriever.RETRIEVER_FACTORIES,
    architecture_factories=architectures.ARCH_FACTORIES,
    description=(
        """\
Questions and answers based on a snapshot of the LangChain python docs.

The environment provides the documents and the retriever information.

Each example is composed of a question and reference answer.

Success is measured based on the accuracy of the answer relative to the reference answer.
We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
"""  # noqa: E501
    ),
)
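To make the factory pattern concrete, here is a hedged sketch of building a retriever from this task. The concrete keys of `RETRIEVER_FACTORIES` are not shown in this diff, so an arbitrary entry is taken, and `OpenAIEmbeddings` is only a stand-in for any `Embeddings` implementation.

```python
# Sketch only: grab an arbitrary retriever factory and build a retriever from it.
from langchain.embeddings import OpenAIEmbeddings

strategy, factory = next(iter(LANGCHAIN_DOCS_TASK.retriever_factories.items()))
retriever = factory(OpenAIEmbeddings())  # Callable[[Embeddings], BaseRetriever]
docs = retriever.get_relevant_documents("How do I use a retriever in LangChain?")
print(strategy, len(docs))
```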
55 changes: 0 additions & 55 deletions langchain_benchmarks/rag/registration.py

This file was deleted.

23 changes: 23 additions & 0 deletions langchain_benchmarks/registration.py
@@ -0,0 +1,23 @@
"""Registry of environments for ease of access."""

from langchain_benchmarks.extraction import email_task
from langchain_benchmarks.schema import Registry
from langchain_benchmarks.rag import LANGCHAIN_DOCS_TASK
from langchain_benchmarks.tool_usage import (
    type_writer_26_funcs,
    type_writer,
    relational_data,
    multiverse_math,
)

# Using lower case naming to make the API a bit prettier when used in a notebook
registry = Registry(
    tasks=[
        type_writer.TYPE_WRITER_TASK,
        type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK,
        relational_data.RELATIONAL_DATA_TASK,
        multiverse_math.MULTIVERSE_MATH,
        email_task.EMAIL_EXTRACTION_TASK,
        LANGCHAIN_DOCS_TASK,
    ]
)
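Given the `Registry` defined in `schema.py` below, the registry built here supports lookups like the following sketch; the task names come from the modules imported above.

```python
# Sketch of the lookups this module enables.
from langchain_benchmarks.registration import registry

for task in registry.tasks:
    print(task.name)

email_task = registry.get_task("Email Extraction")  # lookup by name
first_task = registry[0]                            # lookup by position
print(email_task.dataset_id, first_task.name)
```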
151 changes: 151 additions & 0 deletions langchain_benchmarks/schema.py
@@ -0,0 +1,151 @@
"""Schema for the Langchain Benchmarks."""
import dataclasses
from typing import List, Callable, Any, Optional, Type, Union, Dict

from langchain.schema import BaseRetriever
from langchain.schema.embeddings import Embeddings
from langchain.tools import BaseTool
from pydantic import BaseModel
from tabulate import tabulate


@dataclasses.dataclass(frozen=True)
class ToolUsageEnvironment:
"""An instance of an environment for tool usage."""

tools: List[BaseTool]
"""The tools that can be used in the environment."""

read_state: Optional[Callable[[], Any]] = None
"""A function that returns the current state of the environment."""


@dataclasses.dataclass(frozen=True)
class BaseTask:
"""A definition of a task."""

name: str
"""The name of the environment."""

dataset_id: str
"""The ID of the langsmith public dataset.

This dataset contains expected inputs/outputs for the environment, and
can be used to evaluate the performance of a model/agent etc.
"""

description: str
"""Description of the task for a data science practitioner.

This can contain information about the task, the dataset, the tools available
etc.
"""

@property
def _table(self) -> List[List[str]]:
"""Return a table representation of the environment."""
return [
["Name", self.name],
["Type", self.__class__.__name__],
["Dataset ID", self.dataset_id],
["Description", self.description],
]

def _repr_html_(self) -> str:
"""Return an HTML representation of the environment."""
return tabulate(
self._table,
tablefmt="html",
)


@dataclasses.dataclass(frozen=True)
class ToolUsageTask(BaseTask):
"""A definition for a task."""

create_environment: Callable[[], ToolUsageEnvironment]
"""Factory that returns an environment."""

instructions: str
"""Instructions for the agent/chain/llm."""


@dataclasses.dataclass(frozen=True)
class ExtractionTask(BaseTask):
"""A definition for an extraction task."""

model: Type[BaseModel]
"""Get the model for the task."""


@dataclasses.dataclass(frozen=True)
class RetrievalTask(BaseTask):
    retriever_factories: Dict[str, Callable[[Embeddings], BaseRetriever]]  # noqa: F821
    """Factories that index the docs using the specified strategy."""
    architecture_factories: Dict[str, Callable[[Embeddings], BaseRetriever]]  # noqa: F821
    """Factory methods that help build some off-the-shelf architectures."""

    @property
    def _table(self) -> List[List[str]]:
        """Get information about the task."""
        table = super()._table
        return table + [
            ["Retriever Factories", ", ".join(self.retriever_factories.keys())],
            ["Architecture Factories", ", ".join(self.architecture_factories.keys())],
        ]


@dataclasses.dataclass(frozen=False)
class Registry:
    tasks: List[BaseTask]

    def get_task(self, name_or_id: Union[int, str]) -> BaseTask:
        """Get the task with the given name or ID."""
        if isinstance(name_or_id, int):
            return self.tasks[name_or_id]

        for env in self.tasks:
            if env.name == name_or_id:
                return env
        raise ValueError(f"Unknown task {name_or_id}")

    def __post_init__(self) -> None:
        """Validate that all the tasks have unique names and IDs."""
        seen_names = set()
        for task in self.tasks:
            if task.name in seen_names:
                raise ValueError(
                    f"Duplicate task name {task.name}. Task names must be unique."
                )
            seen_names.add(task.name)

    def _repr_html_(self) -> str:
        """Return an HTML representation of the registry."""
        headers = [
            "Name",
            "Dataset ID",
            "Description",
        ]
        table = [
            [
                env.name,
                env.dataset_id,
                env.description,
            ]
            for env in self.tasks
        ]
        return tabulate(table, headers=headers, tablefmt="html")

    def __getitem__(self, key: Union[int, str]) -> BaseTask:
        """Get an environment from the registry."""
        if isinstance(key, slice):
            raise NotImplementedError("Slicing is not supported.")
        elif isinstance(key, (int, str)):
            # Delegate lookup by index or by task name.
            return self.get_task(key)
        else:
            raise TypeError("Key must be an integer or a string.")

    def add(self, task: BaseTask) -> None:
        """Add a task to the registry."""
        if not isinstance(task, BaseTask):
            raise TypeError("Only tasks can be added to the registry.")
        self.tasks.append(task)
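To show how the dataclasses compose, here is a hedged sketch that defines and registers a custom extraction task; the task name, dataset id, and model are invented purely for illustration.

```python
# Hypothetical example: composing the schema types added in this file.
from pydantic import BaseModel

from langchain_benchmarks.schema import ExtractionTask, Registry


class Invoice(BaseModel):
    """Invented schema, used only for illustration."""

    vendor: str
    total: float


my_registry = Registry(tasks=[])
my_registry.add(
    ExtractionTask(
        name="Invoice Extraction",                 # hypothetical task name
        dataset_id="<your-langsmith-dataset-id>",  # placeholder, not a real dataset
        description="Extract structured fields from invoices.",
        model=Invoice,
    )
)
print(my_registry.get_task("Invoice Extraction").name)
```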
21 changes: 3 additions & 18 deletions langchain_benchmarks/tool_usage/README.md
@@ -1,19 +1,4 @@
# Testing Agents
# Tool usage

This directory contains environments that can be used to test agents' ability
to use tools and make decisions.

## Environments

Environments are named in the style of e[env_number]_[name].py.

### e01_alpha

* Consists of 3 relational tables of users, locations and foods.
* Defines a set of tools that can be used with these tables.
* Agent should use the given tools to answer questions.

## Running Evaluation

Please refer to the following example to see how to set up and run evaluation
for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb).
This sub-package includes code to help test how well agents can use tools to
make decisions.
3 changes: 1 addition & 2 deletions langchain_benchmarks/tool_usage/__init__.py
@@ -1,6 +1,5 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
from langchain_benchmarks.tool_usage.registration import registry

# Please keep this list sorted!
__all__ = ["registry", "STANDARD_AGENT_EVALUATOR"]
__all__ = ["STANDARD_AGENT_EVALUATOR"]