Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add clone/download dataset, handle web vs. api paths #20

Merged
merged 2 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions langchain_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from langchain_benchmarks.utils._langsmith import (
    clone_public_dataset,
    download_public_dataset,
)

# Please keep this list sorted!
__all__ = ["clone_public_dataset", "download_public_dataset"]
134 changes: 101 additions & 33 deletions langchain_benchmarks/utils/_langsmith.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,122 @@
"""Copy the public dataset to your own langsmith tenant."""
import json
import urllib.parse
from pathlib import Path
from typing import Union, Optional, Tuple
from uuid import UUID

from langsmith import Client
from langsmith.utils import LangSmithNotFoundError
from tqdm import tqdm
from tqdm import auto

WEB_API_URL = "https://web.smith.langchain.com/"


def _parse_token_or_url(url_or_token: str, api_url: str) -> Tuple[str, Optional[str]]:
"""Parse a public dataset URL or share token."""
try:
UUID(url_or_token)
return api_url, url_or_token
except ValueError:
pass

# Then it's a URL
parsed_url = urllib.parse.urlparse(url_or_token)
# Extract the UUID from the path
path_parts = parsed_url.path.split("/")
uuid = path_parts[-2] if len(path_parts) >= 2 else None
return WEB_API_URL, uuid


# PUBLIC API


def clone_public_dataset(
    token_or_url: str,
    *,
    dataset_name: Optional[str] = None,
    source_api_url: str = WEB_API_URL,
) -> None:
    """Clone a public dataset to your own langsmith tenant.

    This operation is idempotent. If you already have a dataset with the given name,
    this function will do nothing.

    Args:
        token_or_url (str): The token of the public dataset to clone.
        dataset_name (str): The name of the dataset to create in your tenant.
        source_api_url: The URL of the langsmith server where the data is hosted.
    """
    if dataset_name is None:
        raise NotImplementedError(
            "Automatic dataset name generation is not implemented yet"
        )
    client = Client()  # Client used to write to langsmith
    try:
        dataset = client.read_dataset(dataset_name=dataset_name)
        if dataset:
            print(f"Dataset {dataset_name} already exists. Skipping.")
            print(f"You can access the dataset at {dataset.url}.")
            return
    except LangSmithNotFoundError:
        # Dataset does not exist yet; proceed to clone it.
        pass

    source_api_url, uuid = _parse_token_or_url(token_or_url, source_api_url)
    # Client pointed at the server hosting the public share. The api_key is a
    # placeholder; reading a public share does not require a real key, but the
    # Client expects one to be set.
    source_client = Client(api_url=source_api_url, api_key="placeholder")
    try:
        # Fetch examples first, so a fetch failure never leaves behind an
        # empty destination dataset.
        examples = auto.tqdm(list(source_client.list_shared_examples(uuid)))
        print("Finished fetching examples. Creating dataset...")
        dataset = client.create_dataset(dataset_name=dataset_name)
        print(f"New dataset created you can access it at {dataset.url}.")
        try:
            client.create_examples(
                inputs=[e.inputs for e in examples],
                outputs=[e.outputs for e in examples],
                dataset_id=dataset.id,
            )
        except BaseException as e:
            # Let's not do automatic clean up for now in case there might be
            # some other reasons why create_examples fails (i.e., not network issue or
            # keyboard interrupt).
            # The risk is that this is an existing dataset that has valid examples
            # populated from another source so we don't want to delete it.
            print(
                f"An error occurred while creating dataset {dataset_name}. "
                "You should delete it manually."
            )
            raise e

        print("Done creating dataset.")
    finally:
        del source_client
        del client

def download_public_dataset(
    token_or_url: str,
    *,
    path: Optional[Union[str, Path]] = None,
    api_url: str = WEB_API_URL,
) -> None:
    """Download a public dataset."""
    api_url, uuid = _parse_token_or_url(token_or_url, api_url)
    # Default destination is "<share-token>.json" in the working directory.
    destination = str(path) if path else f"{uuid}.json"
    if not destination.endswith(".json"):
        raise ValueError(f"Path must end with .json got: {destination}")

    # The source client reads from the server hosting the shared data; the
    # destination for the dataset is the local filesystem. The api_key is a
    # placeholder — reading a public share does not need a real key.
    source_client = Client(api_url=api_url, api_key="placeholder")
    try:
        # Fetch examples first
        print("Fetching examples...")
        fetched = auto.tqdm(list(source_client.list_shared_examples(uuid)))
        with open(destination, mode="w", encoding="utf-8") as out_file:
            as_dicts = [json.loads(example.json()) for example in fetched]
            json.dump(as_dicts, out_file, indent=2)
        print("Done fetching examples.")
    finally:
        del source_client
45 changes: 44 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-benchmarks"
version = "0.1.0"
version = "0.0.1"
description = "Flex them feathers! 🦜💪"
authors = ["LangChain AI"]
license = "MIT"
Expand All @@ -11,6 +11,7 @@ python = "^3.8.1"
langchain = ">=0.0.333"
langsmith = "^0.0.64"
tqdm = "^4.66.1"
ipywidgets = "^8.1.1"

[tool.poetry.group.dev.dependencies]
jupyterlab = "^3.6.1"
Expand Down
6 changes: 1 addition & 5 deletions tests/unit_tests/test_public_api.py
Original file line number Diff line number Diff line change
def test_public_api() -> None:
    """Test that the public API is correct."""
    # This test will also fail if __all__ is not sorted.
    # Please keep it sorted!
    assert __all__ == sorted(["clone_public_dataset", "download_public_dataset"])
6 changes: 3 additions & 3 deletions tests/unit_tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from langsmith.schemas import Dataset, Example
from langsmith.utils import LangSmithNotFoundError

from langchain_benchmarks.utils._langsmith import clone_dataset
from langchain_benchmarks.utils._langsmith import clone_public_dataset


# Define a mock Client class that overrides the required methods
Expand Down Expand Up @@ -95,10 +95,10 @@ def test_clone_dataset() -> None:
dataset_name = "my_dataset"

with mock_langsmith_client() as mock_client:
clone_dataset(public_dataset_token, dataset_name)
clone_public_dataset(public_dataset_token, dataset_name=dataset_name)
assert mock_client.datasets[0].name == dataset_name
assert len(mock_client.examples) == 2

# Check idempotency
clone_dataset(public_dataset_token, dataset_name)
clone_public_dataset(public_dataset_token, dataset_name=dataset_name)
assert len(mock_client.examples) == 2