Refactor for new registry #32

Closed · wants to merge 6 commits
160 changes: 112 additions & 48 deletions docs/source/notebooks/tool_usage.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion langchain_benchmarks/__init__.py
@@ -1,7 +1,8 @@
from langchain_benchmarks.registration import registry
from langchain_benchmarks.utils._langsmith import (
    clone_public_dataset,
    download_public_dataset,
)

# Please keep this list sorted!
__all__ = ["clone_public_dataset", "download_public_dataset"]
__all__ = ["clone_public_dataset", "download_public_dataset", "registry"]
74 changes: 74 additions & 0 deletions langchain_benchmarks/extraction/email_task.py
@@ -0,0 +1,74 @@
from enum import Enum
from typing import Optional, List

from langchain.schema.language_model import BaseLanguageModel
from langchain.smith import RunEvalConfig
from pydantic import BaseModel, Field

from langchain_benchmarks.schema import ExtractionTask


class ToneEnum(str, Enum):
"""The tone of the email."""

positive = "positive"
negative = "negative"


class Email(BaseModel):
"""Relevant information about an email."""

sender: Optional[str] = Field(None, description="The sender's name, if available")
sender_phone_number: Optional[str] = Field(
None, description="The sender's phone number, if available"
)
sender_address: Optional[str] = Field(
None, description="The sender's address, if available"
)
action_items: List[str] = Field(
..., description="A list of action items requested by the email"
)
topic: str = Field(
..., description="High level description of what the email is about"
)
tone: ToneEnum = Field(..., description="The tone of the email.")


def get_eval_config(eval_llm: BaseLanguageModel) -> RunEvalConfig:
    """Get the evaluation configuration for the email task."""
    return RunEvalConfig(
        evaluators=[
            RunEvalConfig.LabeledScoreString(
                criteria={
                    "accuracy": """
Score 1: The answer is incorrect and unrelated to the question or reference document.
Score 3: The answer is partially correct but has more than one omission or major error.
Score 5: The answer is mostly correct but has more than one omission or major error.
Score 7: The answer is mostly correct but has at most one omission or major error.
Score 9: The answer is mostly correct with no omissions and only minor errors, and aligns with the reference document.
Score 10: The answer is correct, complete, and aligns with the reference document. Extra information is acceptable if it is sensible.

If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct.
If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct and not be penalized.
"""  # noqa
                },
                llm=eval_llm,
                normalize_by=10.0,
            ),
        ],
    )


EMAIL_EXTRACTION_TASK = ExtractionTask(
name="Email Extraction",
dataset_id="https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d",
model=Email,
description="""\
A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, \
as well as a script for initial extraction and formatting of other emails from \
an arbitrary .mbox file like the one exported by Gmail.

Some additional cleanup of the data was done by hand after the initial pass.

See https://github.com/jacoblee93/oss-model-extraction-evals.
""",
)
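As a quick illustration of the extraction target, the sketch below instantiates the `Email` schema directly. The field values are invented, and only standard pydantic (v1-style) behaviour is assumed.

```python
# Hypothetical record conforming to the Email schema above (values are made up).
example = Email(
    sender="Jane Doe",
    action_items=["Reply with availability for a call next week"],
    topic="Scheduling a product demo",
    tone=ToneEnum.positive,
)
print(example.json(indent=2))       # pydantic v1-style serialization
print(Email.schema()["required"])   # fields an extractor must always produce
```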
6 changes: 4 additions & 2 deletions langchain_benchmarks/rag/__init__.py
@@ -1,6 +1,8 @@
"""RAG environments."""
from langchain_benchmarks.rag.evaluators import RAG_EVALUATION
from langchain_benchmarks.rag.registration import registry
from langchain_benchmarks.rag.environments.langchain_docs.task import (
    LANGCHAIN_DOCS_TASK,
)

# Please keep this list sorted!
__all__ = ["registry", "RAG_EVALUATION"]
__all__ = ["LANGCHAIN_DOCS_TASK", "RAG_EVALUATION"]
@@ -1 +0,0 @@
DATASET_ID = "452ccafc-18e1-4314-885b-edd735f17b9d" # ID of public LangChain Docs dataset
@@ -2,4 +2,4 @@
    ARCH_FACTORIES,
)

__all__ = ["ARCH_FACTORIES"]
__all__ = ["ARCH_FACTORIES"]
28 changes: 28 additions & 0 deletions langchain_benchmarks/rag/environments/langchain_docs/task.py
@@ -0,0 +1,28 @@
from langchain_benchmarks.rag.environments.langchain_docs import (
    langchain_docs_retriever,
    architectures,
)
from langchain_benchmarks.schema import RetrievalTask

DATASET_ID = (
    "452ccafc-18e1-4314-885b-edd735f17b9d"  # ID of public LangChain Docs dataset
)

LANGCHAIN_DOCS_TASK = RetrievalTask(
    name="LangChain Docs Q&A",
    dataset_id=DATASET_ID,
    retriever_factories=langchain_docs_retriever.RETRIEVER_FACTORIES,
    architecture_factories=architectures.ARCH_FACTORIES,
    description=(
        """\
Questions and answers based on a snapshot of the LangChain python docs.

The environment provides the documents and the retriever information.

Each example is composed of a question and reference answer.

Success is measured based on the accuracy of the answer relative to the reference answer.
We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
"""  # noqa: E501
    ),
)
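To make the factory pattern concrete, here is a hedged sketch of building a retriever from this task. The concrete keys of `RETRIEVER_FACTORIES` are not shown in this diff, so an arbitrary entry is taken, and `OpenAIEmbeddings` is only a stand-in for any `Embeddings` implementation.

```python
# Sketch only: grab an arbitrary retriever factory and build a retriever from it.
from langchain.embeddings import OpenAIEmbeddings

strategy, factory = next(iter(LANGCHAIN_DOCS_TASK.retriever_factories.items()))
retriever = factory(OpenAIEmbeddings())  # Callable[[Embeddings], BaseRetriever]
docs = retriever.get_relevant_documents("How do I use a retriever in LangChain?")
print(strategy, len(docs))
```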
55 changes: 0 additions & 55 deletions langchain_benchmarks/rag/registration.py

This file was deleted.

23 changes: 23 additions & 0 deletions langchain_benchmarks/registration.py
@@ -0,0 +1,23 @@
"""Registry of environments for ease of access."""

from langchain_benchmarks.extraction import email_task
from langchain_benchmarks.schema import Registry
from langchain_benchmarks.rag import LANGCHAIN_DOCS_TASK
from langchain_benchmarks.tool_usage import (
    type_writer_26_funcs,
    type_writer,
    relational_data,
    multiverse_math,
)

# Using lower case naming to make the API a bit prettier when used in a notebook
registry = Registry(
    tasks=[
        type_writer.TYPE_WRITER_TASK,
        type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK,
        relational_data.RELATIONAL_DATA_TASK,
        multiverse_math.MULTIVERSE_MATH,
        email_task.EMAIL_EXTRACTION_TASK,
        LANGCHAIN_DOCS_TASK,
    ]
)
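Given the `Registry` defined in `schema.py` below, the registry built here supports lookups like the following sketch; the task names come from the modules imported above.

```python
# Sketch of the lookups this module enables.
from langchain_benchmarks.registration import registry

for task in registry.tasks:
    print(task.name)

email_task = registry.get_task("Email Extraction")  # lookup by name
first_task = registry[0]                            # lookup by position
print(email_task.dataset_id, first_task.name)
```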
151 changes: 151 additions & 0 deletions langchain_benchmarks/schema.py
@@ -0,0 +1,151 @@
"""Schema for the Langchain Benchmarks."""
import dataclasses
from typing import List, Callable, Any, Optional, Type, Union, Dict

from langchain.schema import BaseRetriever
from langchain.schema.embeddings import Embeddings
from langchain.tools import BaseTool
from pydantic import BaseModel
from tabulate import tabulate


@dataclasses.dataclass(frozen=True)
class ToolUsageEnvironment:
"""An instance of an environment for tool usage."""

tools: List[BaseTool]
"""The tools that can be used in the environment."""

read_state: Optional[Callable[[], Any]] = None
"""A function that returns the current state of the environment."""


@dataclasses.dataclass(frozen=True)
class BaseTask:
"""A definition of a task."""

name: str
"""The name of the environment."""

dataset_id: str
"""The ID of the langsmith public dataset.

This dataset contains expected inputs/outputs for the environment, and
can be used to evaluate the performance of a model/agent etc.
"""

description: str
"""Description of the task for a data science practitioner.

This can contain information about the task, the dataset, the tools available
etc.
"""

@property
def _table(self) -> List[List[str]]:
"""Return a table representation of the environment."""
return [
["Name", self.name],
["Type", self.__class__.__name__],
["Dataset ID", self.dataset_id],
["Description", self.description],
]

def _repr_html_(self) -> str:
"""Return an HTML representation of the environment."""
return tabulate(
self._table,
tablefmt="html",
)


@dataclasses.dataclass(frozen=True)
class ToolUsageTask(BaseTask):
"""A definition for a task."""

create_environment: Callable[[], ToolUsageEnvironment]
"""Factory that returns an environment."""

instructions: str
"""Instructions for the agent/chain/llm."""


@dataclasses.dataclass(frozen=True)
class ExtractionTask(BaseTask):
"""A definition for an extraction task."""

model: Type[BaseModel]
"""Get the model for the task."""


@dataclasses.dataclass(frozen=True)
class RetrievalTask(BaseTask):
    retriever_factories: Dict[str, Callable[[Embeddings], BaseRetriever]]  # noqa: F821
    """Factories that index the docs using the specified strategy."""
    architecture_factories: Dict[str, Callable[[Embeddings], BaseRetriever]]  # noqa: F821
    """Factory methods that help build some off-the-shelf architectures."""

    @property
    def _table(self) -> List[List[str]]:
        """Get information about the task."""
        table = super()._table
        return table + [
            ["Retriever Factories", ", ".join(self.retriever_factories.keys())],
            ["Architecture Factories", ", ".join(self.architecture_factories.keys())],
        ]


@dataclasses.dataclass(frozen=False)
class Registry:
    tasks: List[BaseTask]

    def get_task(self, name_or_id: Union[int, str]) -> BaseTask:
        """Get the task with the given name or ID."""
        if isinstance(name_or_id, int):
            return self.tasks[name_or_id]

        for env in self.tasks:
            if env.name == name_or_id:
                return env
        raise ValueError(f"Unknown task {name_or_id}")

    def __post_init__(self) -> None:
        """Validate that all the tasks have unique names and IDs."""
        seen_names = set()
        for task in self.tasks:
            if task.name in seen_names:
                raise ValueError(
                    f"Duplicate task name {task.name}. Task names must be unique."
                )
            seen_names.add(task.name)

    def _repr_html_(self) -> str:
        """Return an HTML representation of the registry."""
        headers = [
            "Name",
            "Dataset ID",
            "Description",
        ]
        table = [
            [
                env.name,
                env.dataset_id,
                env.description,
            ]
            for env in self.tasks
        ]
        return tabulate(table, headers=headers, tablefmt="html")

    def __getitem__(self, key: Union[int, str]) -> BaseTask:
        """Get an environment from the registry."""
        if isinstance(key, slice):
            raise NotImplementedError("Slicing is not supported.")
        elif isinstance(key, (int, str)):
            # Delegate lookup by index or by task name.
            return self.get_task(key)
        else:
            raise TypeError("Key must be an integer or a string.")

    def add(self, task: BaseTask) -> None:
        """Add a task to the registry."""
        if not isinstance(task, BaseTask):
            raise TypeError("Only tasks can be added to the registry.")
        self.tasks.append(task)
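To show how the dataclasses compose, here is a hedged sketch that defines and registers a custom extraction task; the task name, dataset id, and model are invented purely for illustration.

```python
# Hypothetical example: composing the schema types added in this file.
from pydantic import BaseModel

from langchain_benchmarks.schema import ExtractionTask, Registry


class Invoice(BaseModel):
    """Invented schema, used only for illustration."""

    vendor: str
    total: float


my_registry = Registry(tasks=[])
my_registry.add(
    ExtractionTask(
        name="Invoice Extraction",                 # hypothetical task name
        dataset_id="<your-langsmith-dataset-id>",  # placeholder, not a real dataset
        description="Extract structured fields from invoices.",
        model=Invoice,
    )
)
print(my_registry.get_task("Invoice Extraction").name)
```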
21 changes: 3 additions & 18 deletions langchain_benchmarks/tool_usage/README.md
@@ -1,19 +1,4 @@
# Testing Agents
# Tool usage

This directory contains environments that can be used to test agents' ability
to use tools and make decisions.

## Environments

Environments are named in the style of e[env_number]_[name].py.

### e01_alpha

* Consists of 3 relational tables of users, locations and foods.
* Defines a set of tools that can be used with these tables.
* Agent should use the given tools to answer questions.

## Running Evaluation

Please refer to the following example to see how to set up and run evaluation
for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb).
This sub-package includes code to help test how well agents can use tools to
make decisions.
3 changes: 1 addition & 2 deletions langchain_benchmarks/tool_usage/__init__.py
@@ -1,6 +1,5 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
from langchain_benchmarks.tool_usage.registration import registry

# Please keep this list sorted!
__all__ = ["registry", "STANDARD_AGENT_EVALUATOR"]
__all__ = ["STANDARD_AGENT_EVALUATOR"]