Showing 2 changed files with 121 additions and 0 deletions.
New file (+30 lines): the "Weekly Tool Benchmarks" GitHub Actions workflow.
```yaml
name: Weekly Tool Benchmarks

env:
  # NOTE: POETRY_VERSION was referenced but never defined in this commit, so
  # the expressions below would expand to an empty string. "1.7.1" is an
  # assumed placeholder; set it to the Poetry version the repository pins.
  POETRY_VERSION: "1.7.1"

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # Runs at midnight (00:00) every Sunday (UTC)

jobs:
  run_tool_benchmarks:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.12 + Poetry ${{ env.POETRY_VERSION }}
        uses: "./.github/actions/poetry_setup"
        with:
          python-version: '3.12'
          poetry-version: ${{ env.POETRY_VERSION }}
          working-directory: .
          cache-key: benchmarks-all

      - name: Install dependencies
        shell: bash
        run: |
          echo "Installing benchmark dependencies with poetry..."
          poetry install --with test,lint,typing,docs

      - name: Execute Tool Benchmarks
        run: python scripts/tool_benchmarks.py
```
New file scripts/tool_benchmarks.py (+91 lines):
```python
import datetime
import uuid

from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_fireworks import ChatFireworks
from langchain_openai import ChatOpenAI
from langsmith.client import Client

from langchain_benchmarks import __version__, registry
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage.agents import StandardAgentFactory

# (model name, model) pairs to benchmark on every tool-usage task.
tests = [
    (
        "claude-3-haiku-20240307",
        ChatAnthropic(model="claude-3-haiku-20240307", temperature=0),
    ),
    (
        "claude-3-sonnet-20240229",
        ChatAnthropic(model="claude-3-sonnet-20240229", temperature=0),
    ),
    (
        "gpt-3.5-turbo-0125",
        ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0),
    ),
    (
        "gpt-4-turbo-2024-04-09",
        ChatOpenAI(model="gpt-4-turbo-2024-04-09", temperature=0),
    ),
    (
        "accounts/fireworks/models/firefunction-v2",
        ChatFireworks(
            model="accounts/fireworks/models/firefunction-v2", temperature=0
        ),
    ),
]

client = Client()  # LangSmith client, used to run over the benchmark datasets
today = datetime.date.today().isoformat()

# Short random suffix so project names stay unique across runs on the same day.
experiment_uuid = uuid.uuid4().hex[:4]

for task in registry.tasks:
    if task.type != "ToolUsageTask":
        continue

    # "Multiverse Math (Tiny)" is a small test dataset that can be used to
    # verify that everything is set up correctly prior to running over all
    # tasks. It may be removed in the future, so it is skipped here.
    if task.name == "Multiverse Math (Tiny)":
        continue

    dataset_name = task.name

    for model_name, model in tests:
        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "{instructions}"),  # Populated from task.instructions
                ("human", "{question}"),  # Populated from each dataset example
                MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
            ]
        )
        rate_limiter = RateLimiter(requests_per_second=1)

        print(f"Benchmarking {task.name} with model: {model_name}")
        eval_config = task.get_eval_config()

        agent_factory = StandardAgentFactory(
            task, model, prompt, rate_limiter=rate_limiter
        )

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{model_name}-{task.name}-{today}-{experiment_uuid}",
            concurrency_level=5,
            project_metadata={
                "model": model_name,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
            },
        )
```
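Because the script deliberately skips "Multiverse Math (Tiny)", that small task is free to use as a local smoke test before the full weekly run. A minimal sketch under the same setup assumptions as the script; the model choice and project name here are illustrative:

```python
# Smoke test: run the small dataset the weekly job skips, to verify that
# packages, LangSmith credentials, and model API keys are configured.
import datetime
import uuid

from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langsmith.client import Client

from langchain_benchmarks import registry
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage.agents import StandardAgentFactory

task = registry["Multiverse Math (Tiny)"]
model = ChatAnthropic(model="claude-3-haiku-20240307", temperature=0)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{instructions}"),
        ("human", "{question}"),
        MessagesPlaceholder("agent_scratchpad"),
    ]
)

client = Client()
client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=StandardAgentFactory(
        task, model, prompt, rate_limiter=RateLimiter(requests_per_second=1)
    ),
    evaluation=task.get_eval_config(),
    project_name=f"smoke-{datetime.date.today().isoformat()}-{uuid.uuid4().hex[:4]}",
    concurrency_level=1,  # keep the smoke test light
)
```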