22 changes: 22 additions & 0 deletions langchain_benchmarks/schema.py
@@ -10,6 +10,7 @@
from langchain.schema import BaseRetriever
from langchain.schema.document import Document
from langchain.schema.embeddings import Embeddings
from langchain.smith import RunEvalConfig
from langchain.tools import BaseTool
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from pydantic import BaseModel
@@ -93,6 +94,27 @@ class ToolUsageTask(BaseTask):
instructions: str
"""Instructions for the agent/chain/llm."""

eval_params: Dict[str, Any]
"""Used to parameterize differences in the evaluation of the task.

These are passed to the standard factory method for creating an evaluator
for tool usage.

For example, for the Multiverse Math task the `output_evaluation` parameter
is set to `qa_math`, which uses a different prompt for evaluating the
output of the agent.

This prompt performs better at comparing the output of the agent against
the reference output.
"""

def get_eval_config(self, **params: Any) -> RunEvalConfig:
"""Get the default evaluator for the environment."""
# Import locally to avoid potential circular imports in the future.
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

finalized_params = {**self.eval_params, **params}
return get_eval_config(**finalized_params)


@dataclasses.dataclass(frozen=True)
class ExtractionTask(BaseTask):
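For context (not part of the diff): a minimal sketch of how the new `get_eval_config` method might be used in a benchmark run, assuming the existing `langchain_benchmarks` registry and the `langchain.smith.run_on_dataset` workflow. The registry key, the agent factory, and the dataset wiring below are illustrative assumptions, not something this PR adds.

# Sketch only (not part of this PR).
from langsmith import Client
from langchain.smith import run_on_dataset
from langchain_benchmarks import registry

task = registry["Multiverse Math"]    # a ToolUsageTask; registry key assumed
eval_config = task.get_eval_config()  # merges task.eval_params with any caller overrides


def agent_factory():
    """Hypothetical factory returning the agent executor under test."""
    ...


run_on_dataset(
    client=Client(),
    dataset_name=task.name,              # assumes the LangSmith dataset shares the task name
    llm_or_chain_factory=agent_factory,  # placeholder; a real run needs a working agent
    evaluation=eval_config,              # the RunEvalConfig produced by the new method
)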
2 changes: 1 addition & 1 deletion langchain_benchmarks/tool_usage/evaluators.py
@@ -112,7 +112,7 @@ class AgentTrajectoryEvaluator(RunEvaluator):
def __init__(
self,
eval_llm: Union[BaseLanguageModel, BaseChatModel, None] = None,
output_evaluation: Literal["qa", "none"] = "qa",
output_evaluation: Literal["qa", "none", "qa_math"] = "qa",
) -> None:
"""Initialize the evaluator."""
if output_evaluation == "none":
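Also for context rather than part of the diff: the new `"qa_math"` option is selected through the `eval_params` plumbing added in `schema.py` above. A rough sketch of that flow, where the local variable names are illustrative and `get_eval_config` is assumed to forward its keyword arguments to `AgentTrajectoryEvaluator`:

# Sketch of the parameter flow (variable names are illustrative).
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

task_defaults = {"output_evaluation": "qa_math"}  # ToolUsageTask.eval_params for Multiverse Math
overrides = {"output_evaluation": "qa"}           # caller-supplied **params take precedence
config = get_eval_config(**{**task_defaults, **overrides})  # mirrors ToolUsageTask.get_eval_config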
3 changes: 3 additions & 0 deletions langchain_benchmarks/tool_usage/tasks/multiverse_math.py
@@ -152,6 +152,9 @@ def get_environment() -> ToolUsageEnvironment:
solve simple math questions and ignore any innate knowledge about math.
"""
),
eval_params={
"output_evaluation": "qa_math",
},
)

# Source dataset used to create the public dataset in LangSmith
1 change: 1 addition & 0 deletions langchain_benchmarks/tool_usage/tasks/relational_data.py
@@ -438,4 +438,5 @@ def get_environment() -> ToolUsageEnvironment:
Success is measured by the ability to answer the question correctly, and efficiently.
"""
),
eval_params={}, # No special evaluation parameters
)
8 changes: 7 additions & 1 deletion langchain_benchmarks/tool_usage/tasks/type_writer.py
@@ -18,7 +18,7 @@ class Paper:
content: str


def create_typer(paper: Paper) -> Callable[[], str]:
def create_typer(paper: Paper) -> Callable[[str], str]:
"""Create a function that types the given letter."""

def type_letter(letter: str) -> str:
@@ -82,6 +82,12 @@ def _read_state() -> Any:
by the length of the string.
"""
),
eval_params={
# For this task, the agent's output is irrelevant;
# what we care about is the final state of the environment
# (i.e., what's written on the virtual paper).
"output_evaluation": "none",
},
)


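One more illustrative aside, not part of the diff: with `output_evaluation` set to `"none"`, grading for the typewriter task presumably rests on the environment state alone, i.e. the text accumulated on the virtual paper by the tool that `create_typer` builds. A small sketch of that interaction, assuming `Paper` is the simple dataclass defined earlier in `type_writer.py` and that `type_letter` appends each letter to `paper.content`:

# Sketch only; the append-to-content behavior is assumed from the surrounding code.
from langchain_benchmarks.tool_usage.tasks.type_writer import Paper, create_typer

paper = Paper(content="")          # the shared "virtual paper"
type_letter = create_typer(paper)  # Callable[[str], str], per the corrected annotation above
for letter in "cat":
    type_letter(letter)
print(paper.content)  # expected to print "cat"; this final state is what the evaluator checks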
6 changes: 6 additions & 0 deletions langchain_benchmarks/tool_usage/tasks/type_writer_26_funcs.py
@@ -92,6 +92,12 @@ def _read_state() -> Any:
given instead of a single tool that takes a letter as an argument.
"""
),
eval_params={
# For this task, the agent's output is irrelevant;
# what we care about is the final state of the environment
# (i.e., what's written on the virtual paper).
"output_evaluation": "none",
},
)

STRINGS_TO_TYPE = [