diff --git a/langchain_benchmarks/registration.py b/langchain_benchmarks/registration.py index 7f10a23..3b12e60 100644 --- a/langchain_benchmarks/registration.py +++ b/langchain_benchmarks/registration.py @@ -20,6 +20,7 @@ type_writer.TYPE_WRITER_TASK, type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK, relational_data.RELATIONAL_DATA_TASK, + multiverse_math.MULTIVERSE_MATH, multiverse_math.MULTIVERSE_MATH_TINY, email_task.EMAIL_EXTRACTION_TASK, chat_extraction.CHAT_EXTRACTION_TASK, diff --git a/langchain_benchmarks/tool_usage/evaluators.py b/langchain_benchmarks/tool_usage/evaluators.py index d2c98e5..9d6e89e 100644 --- a/langchain_benchmarks/tool_usage/evaluators.py +++ b/langchain_benchmarks/tool_usage/evaluators.py @@ -5,17 +5,13 @@ * Agents must output "intermediate_steps" in their run outputs. * The dataset must have "expected_steps" in its outputs. """ -from typing import Literal, Optional, Union - import re -from typing import Any, Optional +from typing import Any, Literal, Optional, Union -from langchain.chains import LLMChain -from langchain.chat_models import ChatOpenAI -from langchain.evaluation import StringEvaluator from langchain.callbacks.manager import collect_runs +from langchain.chains import LLMChain from langchain.chat_models import ChatOpenAI -from langchain.evaluation import EvaluatorType, load_evaluator +from langchain.evaluation import EvaluatorType, StringEvaluator, load_evaluator from langchain.evaluation.schema import StringEvaluator from langchain.smith import RunEvalConfig from langchain_core.language_models import BaseChatModel, BaseLanguageModel diff --git a/langchain_benchmarks/tool_usage/tasks/multiverse_math.py b/langchain_benchmarks/tool_usage/tasks/multiverse_math.py index 273e24c..671dbe1 100644 --- a/langchain_benchmarks/tool_usage/tasks/multiverse_math.py +++ b/langchain_benchmarks/tool_usage/tasks/multiverse_math.py @@ -127,36 +127,6 @@ def get_environment() -> ToolUsageEnvironment: ) -MULTIVERSE_MATH_TINY = ToolUsageTask( - name="Multiverse Math (Tiny)", - dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d", - create_environment=get_environment, - instructions=( - "You are requested to solve math questions in an alternate " - "mathematical universe. The operations have been altered to yield " - "different results than expected. Do not guess the answer or rely on your " - " innate knowledge of math. Use the provided tools to answer the question. " - "While associativity and commutativity apply, distributivity does not. Answer " - "the question using the fewest possible tools. Only include the numeric " - "response without any clarifications." - ), - description=( - """\ -An environment that contains a few basic math operations, but with altered results. - -For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \ -The basic operations retain some basic properties, such as commutativity, \ -associativity, and distributivity; however, the results are different than expected. - -The objective of this task is to evaluate the ability to use the provided tools to \ -solve simple math questions and ignore any innate knowledge about math. -""" - ), - eval_params={ - "output_evaluation": "qa_math_without_question", - }, -) - # Source dataset used to create the public dataset in LangSmith DATASET_TINY = [ { @@ -275,6 +245,70 @@ def get_environment() -> ToolUsageEnvironment: }, ] +MULTIVERSE_MATH_TINY = ToolUsageTask( + name="Multiverse Math (Tiny)", + dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d", + create_environment=get_environment, + instructions=( + "You are requested to solve math questions in an alternate " + "mathematical universe. The operations have been altered to yield " + "different results than expected. Do not guess the answer or rely on your " + " innate knowledge of math. Use the provided tools to answer the question. " + "While associativity and commutativity apply, distributivity does not. Answer " + "the question using the fewest possible tools. Only include the numeric " + "response without any clarifications." + ), + description=( + """\ +An environment that contains a few basic math operations, but with altered results. + +For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \ +The basic operations retain some basic properties, such as commutativity, \ +associativity, and distributivity; however, the results are different than expected. + +The objective of this task is to evaluate the ability to use the provided tools to \ +solve simple math questions and ignore any innate knowledge about math. + +This is a tiny version of the Multiverse Math task, with 10 examples only. +""" + ), + eval_params={ + "output_evaluation": "qa_math_without_question", + }, +) + +MULTIVERSE_MATH = ToolUsageTask( + name="Multiverse Math", + dataset_id="https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d", + create_environment=get_environment, + instructions=( + "You are requested to solve math questions in an alternate " + "mathematical universe. The operations have been altered to yield " + "different results than expected. Do not guess the answer or rely on your " + " innate knowledge of math. Use the provided tools to answer the question. " + "While associativity and commutativity apply, distributivity does not. Answer " + "the question using the fewest possible tools. Only include the numeric " + "response without any clarifications." + ), + description=( + """\ +An environment that contains a few basic math operations, but with altered results. + +For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \ +The basic operations retain some basic properties, such as commutativity, \ +associativity, and distributivity; however, the results are different than expected. + +The objective of this task is to evaluate the ability to use the provided tools to \ +solve simple math questions and ignore any innate knowledge about math. + +This task is associated with 20 test examples. +""" + ), + eval_params={ + "output_evaluation": "qa_math_without_question", + }, +) + def _create_dataset() -> None: """Create a dataset with the langsmith client.""" @@ -283,11 +317,11 @@ def _create_dataset() -> None: client = Client() dataset = client.create_dataset( - dataset_name=MULTIVERSE_MATH_TINY.name, - description=MULTIVERSE_MATH_TINY.description, + dataset_name=MULTIVERSE_MATH.name, + description=MULTIVERSE_MATH.description, ) - for example in DATASET_TINY: + for example in DATASET: client.create_example( inputs={ "question": example["question"],