From be682101d8ecc007710ff9a72fa56c284efd305f Mon Sep 17 00:00:00 2001
From: isaac hershenson <ihershenson@hmc.edu>
Date: Wed, 10 Jul 2024 22:43:40 -0700
Subject: [PATCH] Pull multiverse-math benchmark prompts from LangChain Hub

---
 scripts/tool_benchmarks.py | 202 ++++++++++++++++---------------------
 1 file changed, 89 insertions(+), 113 deletions(-)

diff --git a/scripts/tool_benchmarks.py b/scripts/tool_benchmarks.py
index 96f10a7..41e3523 100644
--- a/scripts/tool_benchmarks.py
+++ b/scripts/tool_benchmarks.py
@@ -1,6 +1,7 @@
 import datetime
 import uuid
 
+from langchain import hub
 from langchain_anthropic import ChatAnthropic
 from langchain_community.vectorstores import FAISS
 from langchain_core.example_selectors import SemanticSimilarityExampleSelector
@@ -12,7 +13,7 @@
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langsmith.client import Client
 
-from langchain_benchmarks import __version__
+from langchain_benchmarks import __version__, registry
 from langchain_benchmarks.rate_limiting import RateLimiter
 from langchain_benchmarks.tool_usage.agents import StandardAgentFactory
 from langchain_benchmarks.tool_usage.tasks.multiverse_math import *
@@ -39,6 +40,7 @@
 
 client = Client()  # Launch langsmith client for cloning datasets
 
+
 experiment_uuid = uuid.uuid4().hex[:4]
 today = datetime.date.today().isoformat()
 for task in registry.tasks:
@@ -77,121 +79,95 @@
         )
         few_shot_messages += converted_messages
 
-few_shot_messages = [m for m in few_shot_messages if not isinstance(m, SystemMessage)]
+    few_shot_messages = [
+        m for m in few_shot_messages if not isinstance(m, SystemMessage)
+    ]
 
-few_shot_str = ""
-for m in few_shot_messages:
-    if isinstance(m.content, list):
-        few_shot_str += "AI message: "
-        for tool_use in m.content:
-            if "name" in tool_use:
-                few_shot_str += f"Use tool {tool_use['name']}, input: {', '.join(f'{k}:{v}' for k,v in tool_use['input'].items())}"
-            else:
-                few_shot_str += tool_use["text"]
-            few_shot_str += "\n"
-    else:
-        if isinstance(m, HumanMessage):
-            few_shot_str += f"Human message: {m.content}"
+    few_shot_str = ""
+    for m in few_shot_messages:
+        if isinstance(m.content, list):
+            few_shot_str += "AI message: "
+            for tool_use in m.content:
+                if "name" in tool_use:
+                    few_shot_str += f"Use tool {tool_use['name']}, input: {', '.join(f'{k}:{v}' for k,v in tool_use['input'].items())}"
+                else:
+                    few_shot_str += tool_use["text"]
+                few_shot_str += "\n"
         else:
-            few_shot_str += f"AI message: {m.content}"
-
-    few_shot_str += "\n"
-
-    example_selector = SemanticSimilarityExampleSelector.from_examples(
-        examples,
-        OpenAIEmbeddings(),
-        FAISS,
-        k=3,
-        input_keys=["question"],
-        example_keys=["messages"],
-    )
-
-    few_shot_prompt = FewShotChatMessagePromptTemplate(
-        input_variables=[],
-        example_selector=example_selector,
-        example_prompt=MessagesPlaceholder("messages"),
-    )
-
-    prompts = [
-        (
-            ChatPromptTemplate.from_messages(
-                [
-                    ("system", "{instructions}"),
-                    ("human", "{question}"),
-                    MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
-                ]
+            if isinstance(m, HumanMessage):
+                few_shot_str += f"Human message: {m.content}"
+            else:
+                few_shot_str += f"AI message: {m.content}"
+
+        few_shot_str += "\n"
+
+        example_selector = SemanticSimilarityExampleSelector.from_examples(
+            examples,
+            OpenAIEmbeddings(),
+            FAISS,
+            k=3,
+            input_keys=["question"],
+            example_keys=["messages"],
+        )
+
+        few_shot_prompt = FewShotChatMessagePromptTemplate(
+            input_variables=[],
+            example_selector=example_selector,
+            example_prompt=MessagesPlaceholder("messages"),
+        )
+
+        prompts = [
+            (
+                hub.pull("multiverse-math-no-few-shot"),
+                "no-few-shot",
             ),
-            "no-few-shot",
-        ),
-        (
-            ChatPromptTemplate.from_messages(
-                [
-                    (
-                        "system",
-                        "{instructions} Here are some example conversations of the user interacting with the AI until the correct answer is reached: ",
-                    ),
-                ]
-                + few_shot_messages
-                + [
-                    ("human", "{question}"),
-                    MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
-                ]
+            (
+                hub.pull("multiverse-math-few-shot-messages"),
+                "few-shot-messages",
             ),
-            "few-shot-message",
-        ),
-        (
-            ChatPromptTemplate.from_messages(
-                [
-                    (
-                        "system",
-                        "{instructions} Here are some example conversations of the user interacting with the AI until the correct answer is reached: "
-                        + few_shot_message,
-                    ),
-                    ("human", "{question}"),
-                    MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
-                ]
+            (
+                hub.pull("multiverse-math-few-shot-str"),
+                "few-shot-string",
             ),
-            "few-shot-string",
-        ),
-        (
-            ChatPromptTemplate.from_messages(
-                [
-                    (
-                        "system",
-                        "{instructions} Here are some example conversations of the user interacting with the AI until the correct answer is reached: ",
-                    ),
-                    few_shot_prompt,
-                    ("human", "{question}"),
-                    MessagesPlaceholder("agent_scratchpad"),
-                ]
+            (
+                ChatPromptTemplate.from_messages(
+                    [
+                        (
+                            "system",
+                            "{instructions} Here are some example conversations of the user interacting with the AI until the correct answer is reached: ",
+                        ),
+                        few_shot_prompt,
+                        ("human", "{question}"),
+                        MessagesPlaceholder("agent_scratchpad"),
+                    ]
+                ),
+                "few-shot-semantic",
             ),
-            "few-shot-semantic",
-        ),
-    ]
-
-    for model_name, model in tests[:-1]:
-        rate_limiter = RateLimiter(requests_per_second=1)
-
-        print(f"Benchmarking {task.name} with model: {model_name}")
-        eval_config = task.get_eval_config()
-
-        for prompt, prompt_name in prompts:
-            agent_factory = StandardAgentFactory(
-                task, model, prompt, rate_limiter=rate_limiter
-            )
-
-            client.run_on_dataset(
-                dataset_name=dataset_name,
-                llm_or_chain_factory=agent_factory,
-                evaluation=eval_config,
-                verbose=False,
-                project_name=f"{model_name}-{task.name}-{prompt_name}-{experiment_uuid}",
-                concurrency_level=5,
-                project_metadata={
-                    "model": model_name,
-                    "id": experiment_uuid,
-                    "task": task.name,
-                    "date": today,
-                    "langchain_benchmarks_version": __version__,
-                },
-            )
+        ]
+
+        for model_name, model in tests[-2:-1]:
+            rate_limiter = RateLimiter(requests_per_second=1)
+
+            print(f"Benchmarking {task.name} with model: {model_name}")
+            eval_config = task.get_eval_config()
+
+            for prompt, prompt_name in prompts:
+                agent_factory = StandardAgentFactory(
+                    task, model, prompt, rate_limiter=rate_limiter
+                )
+
+                client.run_on_dataset(
+                    dataset_name=dataset_name,
+                    llm_or_chain_factory=agent_factory,
+                    evaluation=eval_config,
+                    verbose=False,
+                    project_name=f"{model_name}-{task.name}-{prompt_name}-{experiment_uuid}",
+                    concurrency_level=5,
+                    project_metadata={
+                        "model": model_name,
+                        "id": experiment_uuid,
+                        "task": task.name,
+                        "date": today,
+                        "langchain_benchmarks_version": __version__,
+                    },
+                )