Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add clone/download dataset, handle web vs. api paths #20

Merged
merged 2 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions langchain_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from langchain_benchmarks.utils._langsmith import (
    clone_public_dataset,
    download_public_dataset,
)

# Please keep this list sorted!
__all__ = ["clone_public_dataset", "download_public_dataset"]
134 changes: 101 additions & 33 deletions langchain_benchmarks/utils/_langsmith.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,122 @@
"""Copy the public dataset to your own langsmith tenant."""
import json
import urllib.parse
from pathlib import Path
from typing import Union, Optional, Tuple
from uuid import UUID

from langsmith import Client
from langsmith.utils import LangSmithNotFoundError
from tqdm import tqdm
from tqdm import auto

WEB_API_URL = "https://web.smith.langchain.com/"


def _parse_token_or_url(url_or_token: str, api_url: str) -> Tuple[str, Optional[str]]:
"""Parse a public dataset URL or share token."""
try:
UUID(url_or_token)
return api_url, url_or_token
except ValueError:
pass

# Then it's a URL
parsed_url = urllib.parse.urlparse(url_or_token)
# Extract the UUID from the path
path_parts = parsed_url.path.split("/")
uuid = path_parts[-2] if len(path_parts) >= 2 else None
return WEB_API_URL, uuid


# PUBLIC API


def clone_public_dataset(
    token_or_url: str,
    *,
    dataset_name: Optional[str] = None,
    source_api_url: str = WEB_API_URL,
) -> None:
    """Clone a public dataset to your own langsmith tenant.

    This operation is idempotent. If you already have a dataset with the given name,
    this function will do nothing.

    Args:
        token_or_url (str): The token of the public dataset to clone.
        dataset_name (str): The name of the dataset to create in your tenant.
        source_api_url: The URL of the langsmith server where the data is hosted.
    """
    if dataset_name is None:
        raise NotImplementedError(
            "Automatic dataset name generation is not implemented yet"
        )
    client = Client()  # Client used to write to langsmith
    try:
        dataset = client.read_dataset(dataset_name=dataset_name)
        if dataset:
            print(f"Dataset {dataset_name} already exists. Skipping.")
            print(f"You can access the dataset at {dataset.url}.")
            return
    except LangSmithNotFoundError:
        # Dataset does not exist yet; proceed to clone it.
        pass

    source_api_url, uuid = _parse_token_or_url(token_or_url, source_api_url)
    # Client pointed at the server hosting the public share. The api_key is a
    # placeholder; reading a public share does not require a real key, but the
    # Client expects one to be set.
    source_client = Client(api_url=source_api_url, api_key="placeholder")
    try:
        # Fetch examples first, so a fetch failure never leaves behind an
        # empty destination dataset.
        examples = auto.tqdm(list(source_client.list_shared_examples(uuid)))
        print("Finished fetching examples. Creating dataset...")
        dataset = client.create_dataset(dataset_name=dataset_name)
        print(f"New dataset created you can access it at {dataset.url}.")
        try:
            client.create_examples(
                inputs=[e.inputs for e in examples],
                outputs=[e.outputs for e in examples],
                dataset_id=dataset.id,
            )
        except BaseException as e:
            # Let's not do automatic clean up for now in case there might be
            # some other reasons why create_examples fails (i.e., not network issue or
            # keyboard interrupt).
            # The risk is that this is an existing dataset that has valid examples
            # populated from another source so we don't want to delete it.
            print(
                f"An error occurred while creating dataset {dataset_name}. "
                "You should delete it manually."
            )
            raise e

        print("Done creating dataset.")
    finally:
        del source_client
        del client

def download_public_dataset(
    token_or_url: str,
    *,
    path: Optional[Union[str, Path]] = None,
    api_url: str = WEB_API_URL,
) -> None:
    """Download a public dataset."""
    api_url, uuid = _parse_token_or_url(token_or_url, api_url)
    # Default destination is "<share-token>.json" in the working directory.
    destination = str(path) if path else f"{uuid}.json"
    if not destination.endswith(".json"):
        raise ValueError(f"Path must end with .json got: {destination}")

    # The source client reads from the server hosting the shared data; the
    # destination for the dataset is the local filesystem. The api_key is a
    # placeholder — reading a public share does not need a real key.
    source_client = Client(api_url=api_url, api_key="placeholder")
    try:
        # Fetch examples first
        print("Fetching examples...")
        fetched = auto.tqdm(list(source_client.list_shared_examples(uuid)))
        with open(destination, mode="w", encoding="utf-8") as out_file:
            as_dicts = [json.loads(example.json()) for example in fetched]
            json.dump(as_dicts, out_file, indent=2)
        print("Done fetching examples.")
    finally:
        del source_client
45 changes: 44 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-benchmarks"
version = "0.1.0"
version = "0.0.1"
description = "Flex them feathers! 🦜💪"
authors = ["LangChain AI"]
license = "MIT"
Expand All @@ -11,6 +11,7 @@ python = "^3.8.1"
langchain = ">=0.0.333"
langsmith = "^0.0.64"
tqdm = "^4.66.1"
ipywidgets = "^8.1.1"

[tool.poetry.group.dev.dependencies]
jupyterlab = "^3.6.1"
Expand Down
6 changes: 1 addition & 5 deletions tests/unit_tests/test_public_api.py
Original file line number Diff line number Diff line change
def test_public_api() -> None:
    """Test that the public API is correct."""
    # This test will also fail if __all__ is not sorted.
    # Please keep it sorted!
    assert __all__ == sorted(["clone_public_dataset", "download_public_dataset"])
6 changes: 3 additions & 3 deletions tests/unit_tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from langsmith.schemas import Dataset, Example
from langsmith.utils import LangSmithNotFoundError

from langchain_benchmarks.utils._langsmith import clone_dataset
from langchain_benchmarks.utils._langsmith import clone_public_dataset


# Define a mock Client class that overrides the required methods
Expand Down Expand Up @@ -95,10 +95,10 @@ def test_clone_dataset() -> None:
dataset_name = "my_dataset"

with mock_langsmith_client() as mock_client:
clone_dataset(public_dataset_token, dataset_name)
clone_public_dataset(public_dataset_token, dataset_name=dataset_name)
assert mock_client.datasets[0].name == dataset_name
assert len(mock_client.examples) == 2

# Check idempotency
clone_dataset(public_dataset_token, dataset_name)
clone_public_dataset(public_dataset_token, dataset_name=dataset_name)
assert len(mock_client.examples) == 2