From f4b9e9547500cd6f1082b67d80d098adc1e46708 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 18 Apr 2024 09:26:01 -0400 Subject: [PATCH 1/2] x --- .../tool_usage/tasks/multiverse_math.py | 37 +------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/langchain_benchmarks/tool_usage/tasks/multiverse_math.py b/langchain_benchmarks/tool_usage/tasks/multiverse_math.py index 671dbe1..f2861bf 100644 --- a/langchain_benchmarks/tool_usage/tasks/multiverse_math.py +++ b/langchain_benchmarks/tool_usage/tasks/multiverse_math.py @@ -128,7 +128,7 @@ def get_environment() -> ToolUsageEnvironment: # Source dataset used to create the public dataset in LangSmith -DATASET_TINY = [ +DATASET = [ { "question": "Add 2 and 3", "answer": add(2, 3), @@ -188,9 +188,6 @@ def get_environment() -> ToolUsageEnvironment: "answer": divide(multiply(15, pi()), 180), "expected_steps": ["pi", "multiply", "divide"], }, -] - -DATASET = DATASET_TINY + [ { "question": "evaluate negate(-131,778)", "answer": negate(-131_778), @@ -245,38 +242,6 @@ def get_environment() -> ToolUsageEnvironment: }, ] -MULTIVERSE_MATH_TINY = ToolUsageTask( - name="Multiverse Math (Tiny)", - dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d", - create_environment=get_environment, - instructions=( - "You are requested to solve math questions in an alternate " - "mathematical universe. The operations have been altered to yield " - "different results than expected. Do not guess the answer or rely on your " - " innate knowledge of math. Use the provided tools to answer the question. " - "While associativity and commutativity apply, distributivity does not. Answer " - "the question using the fewest possible tools. Only include the numeric " - "response without any clarifications." - ), - description=( - """\ -An environment that contains a few basic math operations, but with altered results. - -For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \ -The basic operations retain some basic properties, such as commutativity, \ -associativity, and distributivity; however, the results are different than expected. - -The objective of this task is to evaluate the ability to use the provided tools to \ -solve simple math questions and ignore any innate knowledge about math. - -This is a tiny version of the Multiverse Math task, with 10 examples only. -""" - ), - eval_params={ - "output_evaluation": "qa_math_without_question", - }, -) - MULTIVERSE_MATH = ToolUsageTask( name="Multiverse Math", dataset_id="https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d", From 28b0054c304a51eb60681b4552abe1edb4c1644c Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 18 Apr 2024 09:28:43 -0400 Subject: [PATCH 2/2] x --- langchain_benchmarks/registration.py | 1 - .../tool_usage/tasks/multiverse_math.py | 40 ++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/langchain_benchmarks/registration.py b/langchain_benchmarks/registration.py index d700208..4b272e2 100644 --- a/langchain_benchmarks/registration.py +++ b/langchain_benchmarks/registration.py @@ -25,7 +25,6 @@ type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK, relational_data.RELATIONAL_DATA_TASK, multiverse_math.MULTIVERSE_MATH, - multiverse_math.MULTIVERSE_MATH_TINY, email_task.EMAIL_EXTRACTION_TASK, chat_extraction.CHAT_EXTRACTION_TASK, LANGCHAIN_DOCS_TASK, diff --git a/langchain_benchmarks/tool_usage/tasks/multiverse_math.py b/langchain_benchmarks/tool_usage/tasks/multiverse_math.py index f2861bf..0beb8c2 100644 --- a/langchain_benchmarks/tool_usage/tasks/multiverse_math.py +++ b/langchain_benchmarks/tool_usage/tasks/multiverse_math.py @@ -128,7 +128,7 @@ def get_environment() -> ToolUsageEnvironment: # Source dataset used to create the public dataset in LangSmith -DATASET = [ +DATASET_TINY = [ { "question": "Add 2 and 3", "answer": add(2, 3), @@ -188,6 +188,9 @@ def get_environment() -> ToolUsageEnvironment: "answer": divide(multiply(15, pi()), 180), "expected_steps": ["pi", "multiply", "divide"], }, +] + +DATASET = DATASET_TINY + [ { "question": "evaluate negate(-131,778)", "answer": negate(-131_778), @@ -242,6 +245,41 @@ def get_environment() -> ToolUsageEnvironment: }, ] +# Provided here for backwards compatibility, but we do not register +# it as a task in the task registry. +# TINY is just the multiverse math task with 10 examples instead of full dataset. +MULTIVERSE_MATH_TINY = ToolUsageTask( + name="Multiverse Math (Tiny)", + dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d", + create_environment=get_environment, + instructions=( + "You are requested to solve math questions in an alternate " + "mathematical universe. The operations have been altered to yield " + "different results than expected. Do not guess the answer or rely on your " + " innate knowledge of math. Use the provided tools to answer the question. " + "While associativity and commutativity apply, distributivity does not. Answer " + "the question using the fewest possible tools. Only include the numeric " + "response without any clarifications." + ), + description=( + """\ +An environment that contains a few basic math operations, but with altered results. + +For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \ +The basic operations retain some basic properties, such as commutativity, \ +associativity, and distributivity; however, the results are different than expected. + +The objective of this task is to evaluate the ability to use the provided tools to \ +solve simple math questions and ignore any innate knowledge about math. + +This is a tiny version of the Multiverse Math task, with 10 examples only. +""" + ), + eval_params={ + "output_evaluation": "qa_math_without_question", + }, +) + MULTIVERSE_MATH = ToolUsageTask( name="Multiverse Math", dataset_id="https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d",