Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add langsmith util to clone dataset #19

Merged
merged 4 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions langchain_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .utils._langsmith import clone_dataset

# Please keep this list sorted!
__all__ = ["clone_dataset"]
Empty file.
54 changes: 54 additions & 0 deletions langchain_benchmarks/utils/_langsmith.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Copy the public dataset to your own langsmith tenant."""
from langsmith import Client
from langsmith.utils import LangSmithNotFoundError
from tqdm import tqdm

# PUBLIC API


def clone_dataset(
public_dataset_token: str,
dataset_name: str,
) -> None:
"""Clone a public dataset to your own langsmith tenant.

This operation is idempotent. If you already have a dataset with the given name,
this function will do nothing.

Args:
public_dataset_token (str): The token of the public dataset to clone.
dataset_name (str): The name of the dataset to create in your tenant.
"""
client = Client()

try:
client.read_dataset(dataset_name=dataset_name)
except LangSmithNotFoundError:
pass
else:
print(f"Dataset {dataset_name} already exists. Skipping.")
return

# Fetch examples first
examples = tqdm(list(client.list_shared_examples(public_dataset_token)))
print("Finished fetching examples. Creating dataset...")
dataset = client.create_dataset(dataset_name=dataset_name)
try:
client.create_examples(
inputs=[e.inputs for e in examples],
outputs=[e.outputs for e in examples],
dataset_id=dataset.id,
)
except BaseException as e:
# Let's not do automatic clean up for now in case there might be
# some other reasons why create_examples fails (i.e., not network issue or
# keyboard interrupt).
# The risk is that this is an existing dataset that has valid examples
# populated from another source so we don't want to delete it.
print(
f"An error occurred while creating dataset {dataset_name}. "
"You should delete it manually."
)
raise e

print("Done creating dataset.")
22 changes: 21 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8.1"
langchain = ">=0.0.333"
langsmith = "^0.0.64"
tqdm = "^4.66.1"

[tool.poetry.group.dev.dependencies]
jupyterlab = "^3.6.1"
Expand Down
12 changes: 12 additions & 0 deletions tests/unit_tests/test_public_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from langchain_benchmarks import __all__


def test_public_api() -> None:
"""Test that the public API is correct."""
# This test will also fail if __all__ is not sorted.
# Please keep it sorted!
assert __all__ == sorted(
[
"clone_dataset",
]
)
104 changes: 104 additions & 0 deletions tests/unit_tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import datetime
import unittest.mock as mock
import uuid
from contextlib import contextmanager
from typing import Any, Generator, List, Mapping, Optional, Sequence
from uuid import UUID

from langsmith.client import ID_TYPE
from langsmith.schemas import Dataset, Example
from langsmith.utils import LangSmithNotFoundError

from langchain_benchmarks.utils._langsmith import clone_dataset


# Define a mock Client class that overrides the required methods
class MockLangSmithClient:
def __init__(self) -> None:
"""Initialize the mock client."""
self.datasets = []
self.examples = []

def read_dataset(self, dataset_name: str) -> Dataset:
for dataset in self.datasets:
if dataset.name == dataset_name:
return dataset
raise LangSmithNotFoundError(f'Dataset "{dataset_name}" not found.')

def create_dataset(self, dataset_name: str) -> Dataset:
# Simulate creating a dataset and returning a mock Dataset object
dataset = Dataset(
id=UUID(int=3), name=dataset_name, created_at=datetime.datetime(2021, 1, 1)
)
self.datasets.append(dataset)
return dataset

def create_examples(
self,
*,
inputs: Sequence[Mapping[str, Any]],
outputs: Optional[Sequence[Optional[Mapping[str, Any]]]] = None,
dataset_id: Optional[ID_TYPE] = None,
dataset_name: Optional[str] = None,
max_concurrency: int = 10,
) -> None:
"""Create examples"""
examples = []
for idx, (input, output) in enumerate(zip(inputs, outputs)):
examples.append(
Example(
id=UUID(int=idx),
inputs=input,
outputs=output,
created_at=datetime.datetime(2021, 1, 1),
dataset_id=dataset_id,
dataset_name=dataset_name,
)
)

return self.examples.extend(examples)

def list_shared_examples(self, public_dataset_token: str) -> List[Example]:
# Simulate fetching shared examples and returning a list of Example objects
example1 = Example(
id=UUID(int=1),
inputs={"a": 1},
outputs={},
created_at=datetime.datetime(2021, 1, 1),
dataset_id=public_dataset_token,
)
example2 = Example(
id=UUID(int=2),
inputs={"b": 2},
outputs={},
created_at=datetime.datetime(2021, 1, 1),
dataset_id=public_dataset_token,
)
return [example1, example2]


@contextmanager
def mock_langsmith_client() -> Generator[None, None, None]:
"""Mock the langsmith Client class."""
from langchain_benchmarks.utils import _langsmith

mock_client = MockLangSmithClient()

with mock.patch.object(_langsmith, "Client") as client:
client.return_value = mock_client
yield mock_client


def test_clone_dataset() -> None:
# Call the clone_dataset function with mock data
public_dataset_token = str(uuid.UUID(int=3))
dataset_name = "my_dataset"

with mock_langsmith_client() as mock_client:
clone_dataset(public_dataset_token, dataset_name)
assert mock_client.datasets[0].name == dataset_name
assert len(mock_client.examples) == 2

# Check idempotency
clone_dataset(public_dataset_token, dataset_name)
assert len(mock_client.examples) == 2