1,092 changes: 715 additions & 377 deletions docs/source/notebooks/tool_usage/multiverse_math.ipynb

Large diffs are not rendered by default.

385 changes: 309 additions & 76 deletions docs/source/notebooks/tool_usage/relational_data.ipynb

Large diffs are not rendered by default.

1,212 changes: 1,044 additions & 168 deletions docs/source/notebooks/tool_usage/typewriter_1.ipynb

Large diffs are not rendered by default.

528 changes: 66 additions & 462 deletions docs/source/notebooks/tool_usage/typewriter_26.ipynb

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions langchain_benchmarks/tool_usage/agents.py
@@ -121,6 +121,7 @@ def _read_state(*args: Any, **kwargs: Any) -> Any:

def _format_input(inputs: dict) -> dict:
"""Make sure that the input is always called `input`."""

if "question" not in inputs:
raise ValueError(
"Expected 'question' to be in the inputs. Found only the following "
@@ -142,7 +143,7 @@ def _format_input(inputs: dict) -> dict:
)

if state_reader is not None:
runnable = agent_executor | RunnablePassthrough.assign(
state=_read_state
).with_config({"run_name": "Read Env State"})
runnable = runnable | RunnablePassthrough.assign(state=_read_state).with_config(
{"run_name": "Read Env State"}
)
return runnable
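For readers skimming the diff, here is a minimal, self-contained sketch of the pattern this hunk switches to (the agent and state reader below are hypothetical stand-ins, not the benchmark's real ones): the agent executor is piped into a `RunnablePassthrough.assign` step that attaches the environment state under a `state` key and names the step for tracing.

```python
from langchain_core.runnables import RunnableLambda, RunnablePassthrough


def _read_state(*args, **kwargs):
    # Stand-in for reading the task environment's state after the agent runs.
    return {"paper": "typed text so far"}


# Fake "agent executor" that just echoes an output dict (assumption for the sketch).
agent_executor = RunnableLambda(
    lambda inputs: {"input": inputs["input"], "output": "done"}
)

runnable = agent_executor | RunnablePassthrough.assign(state=_read_state).with_config(
    {"run_name": "Read Env State"}
)

print(runnable.invoke({"input": "type the word cat"}))
# {'input': 'type the word cat', 'output': 'done', 'state': {'paper': 'typed text so far'}}
```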
78 changes: 65 additions & 13 deletions langchain_benchmarks/tool_usage/evaluators.py
Expand Up @@ -5,20 +5,23 @@
* Agents must output "intermediate_steps" in their run outputs.
* The dataset must have "expected_steps" in its outputs.
"""
from typing import Optional
from typing import Literal, Optional, Union

from langchain.callbacks.manager import collect_runs
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain.evaluation.schema import StringEvaluator
from langchain.smith import RunEvalConfig
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from langsmith.evaluation.evaluator import (
EvaluationResult,
EvaluationResults,
RunEvaluator,
)
from langsmith.schemas import Example, Run

from langchain_benchmarks.tool_usage.prompts import QA_TEMPLATE_FOR_MULTIVERSE_MATH


def compare_outputs(
run_outputs: dict,
@@ -97,16 +100,41 @@ def compare_outputs(
class AgentTrajectoryEvaluator(RunEvaluator):
"""An evaluator that can be used in conjunction with a standard agent interface."""

def __init__(self) -> None:
def __init__(
self,
eval_llm: Union[BaseLanguageModel, BaseChatModel, None] = None,
output_evaluation: Literal["qa", "none"] = "qa",
) -> None:
"""Initialize the evaluator."""
eval_llm = ChatOpenAI(
model="gpt-4",
temperature=0,
model_kwargs={"seed": 42},
max_retries=1,
request_timeout=60,
)
self.qa_evaluator = load_evaluator(EvaluatorType.QA, llm=eval_llm)
if output_evaluation == "none":
if eval_llm is not None:
raise ValueError(
"If output_evaluation is 'none', then eval_llm must be None"
)
qa_evaluator = None
else:
eval_llm = eval_llm or ChatOpenAI(
model="gpt-4",
temperature=0,
model_kwargs={"seed": 42},
max_retries=1,
request_timeout=60,
)
if output_evaluation == "qa":
qa_evaluator = load_evaluator(EvaluatorType.QA, llm=eval_llm)
elif output_evaluation == "qa_math":
qa_evaluator = load_evaluator(
EvaluatorType.QA,
llm=eval_llm,
prompt=QA_TEMPLATE_FOR_MULTIVERSE_MATH,
)
else:
raise ValueError(
f"output_evaluation must be one of 'qa' or 'none', "
f"got {output_evaluation}"
)

self.qa_evaluator = qa_evaluator

def evaluate_run(
self, run: Run, example: Optional[Example] = None
@@ -137,6 +165,30 @@ def evaluate_run(
)


def get_eval_config() -> RunEvalConfig:
"""Returns the default evaluator for the environment."""
return RunEvalConfig(custom_evaluators=[AgentTrajectoryEvaluator()])
def get_eval_config(
*,
eval_llm: Union[BaseLanguageModel, BaseChatModel, None] = None,
Reviewer comment (Collaborator): Do we need both chat and base language model?

output_evaluation: Literal["qa", "qa_math", "none"] = "qa",
) -> RunEvalConfig:
"""Get the default evaluator for the environment.

Args:
eval_llm: The language model to use for grading the `output` response.
output_evaluation: How to evaluate the output of the agent.
- 'qa' will use the qa evaluator to compare the output to the reference.
- 'qa_math' will use the qa evaluator to compare the output to the
reference, using a prompt that works better for multiverse math.
- 'none' will not evaluate the output of the agent; in some cases it is
only relevant to evaluate how the agent used tools, not what it output.

Returns:
A RunEvalConfig that can be used to evaluate the environment
"""
return RunEvalConfig(
custom_evaluators=[
AgentTrajectoryEvaluator(
eval_llm=eval_llm, output_evaluation=output_evaluation
)
]
)
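A hedged usage sketch of the new `get_eval_config` options (the import path follows this module's location; the model choice is only an example, not a requirement):

```python
from langchain.chat_models import ChatOpenAI

from langchain_benchmarks.tool_usage.evaluators import get_eval_config

# Grade final answers with the multiverse-math-aware QA prompt.
math_config = get_eval_config(
    eval_llm=ChatOpenAI(model="gpt-4", temperature=0),
    output_evaluation="qa_math",
)

# Or skip output grading entirely and only score how the agent used its tools.
trajectory_only_config = get_eval_config(output_evaluation="none")
```

Either config can then be passed wherever a `RunEvalConfig` is expected, e.g. as the evaluation config for a LangSmith `run_on_dataset` call.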
24 changes: 24 additions & 0 deletions langchain_benchmarks/tool_usage/prompts.py
@@ -0,0 +1,24 @@
from langchain_core.prompts import PromptTemplate

MATH_TEMPLATE = """\
You live in an alternate universe. Do not assume that you know anything.
You are a teacher grading a quiz.
You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: CORRECT or INCORRECT here

Given that you live in an alternate universe the TRUE answer may be different from what you expect. That's OK!

Grade the student answer based ONLY on whether it matches the TRUE answer. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin!

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:"""
QA_TEMPLATE_FOR_MULTIVERSE_MATH = PromptTemplate(
input_variables=["query", "result", "answer"], template=MATH_TEMPLATE
)
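To see what the grader will actually receive, the template can be formatted directly; the values below are placeholders rather than real dataset entries:

```python
from langchain_benchmarks.tool_usage.prompts import QA_TEMPLATE_FOR_MULTIVERSE_MATH

# Placeholder values only: this just shows which slots the QA evaluator fills in.
print(
    QA_TEMPLATE_FOR_MULTIVERSE_MATH.format(
        query="<the task question>",
        result="<the agent's final answer>",
        answer="<the reference answer from the dataset>",
    )
)
```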
6 changes: 3 additions & 3 deletions langchain_benchmarks/tool_usage/tasks/multiverse_math.py
@@ -190,8 +190,8 @@ def get_environment() -> ToolUsageEnvironment:
"expected_steps": ["log", "multiply"],
},
{
"question": "calculate sqrt of 101 to 4 digits of precision",
Reviewer comment (Collaborator): Are we fine keeping multiple sources of truth?

"answer": round(power(101, 0.4), 4),
"question": "calculate 101 to the power of 0.5 to 4 digits of precision",
"answer": round(power(101, 0.5), 4),
"expected_steps": ["power", "round"],
},
{
@@ -207,7 +207,7 @@ def get_environment() -> ToolUsageEnvironment:
"after calculating the sin of 1.5 radians, divide "
"the result by cos of 1.5 radians"
),
"answer": sin(1.5) / cos(1.5),
"answer": divide(sin(1.5), cos(1.5)),
"expected_steps": ["sin", "cos", "divide"],
},
{
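Computing the reference as `divide(sin(1.5), cos(1.5))` keeps the dataset answer tied to the task's own altered operations instead of a second, hand-computed source of truth, which is the concern raised in the reviewer comment above. The sketch below uses made-up redefinitions (not the task's real ones) purely to illustrate how a hard-coded standard-library value would drift from what the tools return:

```python
import math


# Hypothetical alternate-universe functions: NOT the benchmark's real definitions.
def sin(radians: float) -> float:
    return 1.1 * math.sin(radians)  # made-up scaling


def cos(radians: float) -> float:
    return 1.1 * math.cos(radians)  # made-up scaling


def divide(a: float, b: float) -> float:
    return 0.5 * a / b  # made-up scaling


# Reference computed with the task's own functions:
print(divide(sin(1.5), cos(1.5)))
# A hand-computed "normal universe" value disagrees under the altered functions:
print(math.sin(1.5) / math.cos(1.5))
```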
18 changes: 9 additions & 9 deletions poetry.lock

Some generated files are not rendered by default.