Merged
277 changes: 216 additions & 61 deletions docs/source/notebooks/tool_usage/intro.ipynb

Large diffs are not rendered by default.

479 changes: 182 additions & 297 deletions docs/source/notebooks/tool_usage/multiverse_math.ipynb

Large diffs are not rendered by default.

649 changes: 182 additions & 467 deletions docs/source/notebooks/tool_usage/relational_data.ipynb

Large diffs are not rendered by default.

18 changes: 7 additions & 11 deletions docs/source/notebooks/tool_usage/typewriter_1.ipynb
@@ -237,15 +237,11 @@
"id": "cd13d120-1bf9-481c-9392-c15ebdd9d77f",
"metadata": {},
"source": [
"## Agent"
]
},
{
"cell_type": "markdown",
"id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
"metadata": {},
"source": [
"Let's build an agent that we can use for evaluation."
"## Agent Factory\n",
"\n",
"For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n",
"\n",
"We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own."
]
},
{
@@ -340,7 +336,7 @@
"\n",
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory.create,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" tags=[\"gpt-3.5-turbo-16k\"],\n",
@@ -626,7 +622,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.11.4"
}
},
"nbformat": 4,
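Both typewriter notebooks now pass the factory object itself as `llm_or_chain_factory` instead of `agent_factory.create`; this works because `OpenAIAgentFactory` implements `__call__`, so a fresh agent executor is built for every evaluation run. A minimal sketch of the intended call pattern, assuming the task is fetched from the benchmark registry (the registry key below is illustrative, not taken from this diff):

from langsmith import Client

from langchain_benchmarks import registry
from langchain_benchmarks.tool_usage.agents import OpenAIAgentFactory
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

# Illustrative registry key -- check the registry for the exact task name.
task = registry["Tool Usage - Typewriter (1 tool)"]

# The factory builds a new agent executor each time it is called.
agent_factory = OpenAIAgentFactory(task, model="gpt-3.5-turbo-16k")

client = Client()
test_run = client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=agent_factory,  # callable factory, not agent_factory.create
    evaluation=get_eval_config(),
    verbose=True,
    tags=["gpt-3.5-turbo-16k"],
)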
10 changes: 7 additions & 3 deletions docs/source/notebooks/tool_usage/typewriter_26.ipynb
@@ -252,7 +252,11 @@
"id": "f1d62a13-3771-460f-b131-4443f669ca3d",
"metadata": {},
"source": [
"## Agent"
"## Agent Factory\n",
"\n",
"For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n",
"\n",
"We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own."
]
},
{
@@ -391,7 +395,7 @@
"\n",
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory.create,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=get_eval_config(),\n",
" verbose=True,\n",
" tags=[\"gpt-3.5-turbo-16k\"],\n",
@@ -757,7 +761,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.11.4"
}
},
"nbformat": 4,
6 changes: 5 additions & 1 deletion langchain_benchmarks/tool_usage/__init__.py
@@ -1,5 +1,9 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

# Please keep this list sorted!
__all__ = ["get_eval_config"]
__all__ = [
"apply_agent_executor_adapter",
"get_eval_config",
]
102 changes: 78 additions & 24 deletions langchain_benchmarks/tool_usage/agents.py
@@ -1,12 +1,12 @@
"""Code for creating an agent factory for evaluating tool usage tasks."""
from typing import Any
from typing import Any, Callable, Optional

from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import Runnable, RunnablePassthrough
from langchain.schema.runnable import Runnable, RunnableLambda, RunnablePassthrough
from langchain.tools.render import format_tool_to_openai_function

from langchain_benchmarks.schema import ToolUsageTask
@@ -19,6 +19,9 @@ def _ensure_output_exists(inputs: dict) -> dict:
return inputs


# PUBLIC API


class OpenAIAgentFactory:
def __init__(
self, task: ToolUsageTask, *, model: str = "gpt-3.5-turbo-16k"
@@ -34,6 +37,10 @@ def __init__(

def create(self) -> Runnable:
"""Agent Executor"""
# For backwards compatibility
return self()

def __call__(self) -> Runnable:
llm = ChatOpenAI(
model=self.model,
temperature=0,
@@ -57,7 +64,7 @@ def create(self) -> Runnable:

runnable_agent = (
{
"input": lambda x: x["question"],
"input": lambda x: x["input"],
"agent_scratchpad": lambda x: format_to_openai_functions(
x["intermediate_steps"]
),
@@ -67,28 +74,75 @@
| OpenAIFunctionsAgentOutputParser()
)

def _read_state(*args: Any, **kwargs: Any) -> Any:
"""Read the state of the environment."""
if env.read_state is not None:
return env.read_state()
else:
return None

runnable = (
AgentExecutor(
agent=runnable_agent,
tools=env.tools,
handle_parsing_errors=True,
return_intermediate_steps=True,
)
| _ensure_output_exists
runnable = AgentExecutor(
agent=runnable_agent,
tools=env.tools,
handle_parsing_errors=True,
return_intermediate_steps=True,
)

if env.read_state is not None:
# If the environment has a state reader, add it to the runnable
runnable = runnable | RunnablePassthrough.assign(state=_read_state)
# Returns `state` in the output if the environment has a state reader
# and makes sure that `output` is always present in the result
return apply_agent_executor_adapter(runnable, state_reader=env.read_state)


# PUBLIC API


def apply_agent_executor_adapter(
agent_executor: AgentExecutor,
*,
state_reader: Optional[Callable[[], Any]] = None,
) -> Runnable:
"""An adapter for the agent executor to standardize its input and output.

1) Map `question` to `input` (`question` is used in the datasets,
but `input` is used in the agent executor)
2) Ensure that `output` is always returned (will be set to "" if missing) --
note that this may be relaxed after more updates in the eval config.
3) Populate `state` key in the response of the agent with the system state
if a state reader is provided.

Args:
agent_executor: the agent executor
state_reader: A callable that takes no parameters and, when invoked, returns
the state of the environment. Used to populate the 'state' key.

Returns:
a new runnable with a standardized output.
"""

def _read_state(*args: Any, **kwargs: Any) -> Any:
"""Read the state of the environment."""
if state_reader is not None:
return state_reader()
else:
return None

def _format_input(inputs: dict) -> dict:
"""Make sure that the input is always called `input`."""
if "question" not in inputs:
raise ValueError(
"Expected 'question' to be in the inputs. Found only the following "
f"keys {sorted(inputs.keys())}."
)

return runnable
inputs = inputs.copy() # Because 'question' is popped below

def __call__(self) -> Runnable:
return self.create()
if "input" not in inputs:
return {"input": inputs.pop("question"), **inputs}
return inputs

runnable = (
RunnableLambda(_format_input).with_config({"run_name": "Format Input"})
| agent_executor
| RunnableLambda(_ensure_output_exists).with_config(
{"run_name": "Ensure Output"}
)
)

if state_reader is not None:
runnable = runnable | RunnablePassthrough.assign(
state=_read_state
).with_config({"run_name": "Read Env State"})
return runnable
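The new `apply_agent_executor_adapter` is also useful on its own when you bring your own agent: it maps `question` to `input`, guarantees an `output` key, and, when a state reader is passed, adds a `state` key to the result. A minimal sketch, assuming `task` is a `ToolUsageTask` exposing `create_environment()` and `my_agent` is a hypothetical runnable agent (the OpenAI-functions wiring from the factory is omitted):

from langchain.agents import AgentExecutor

from langchain_benchmarks.tool_usage import apply_agent_executor_adapter

# `task` is an assumed ToolUsageTask; `my_agent` is a hypothetical runnable agent.
env = task.create_environment()
executor = AgentExecutor(
    agent=my_agent,
    tools=env.tools,
    handle_parsing_errors=True,
    return_intermediate_steps=True,
)

# The adapter renames `question` -> `input`, ensures `output` exists, and,
# because a state reader is supplied, attaches the environment state as `state`.
runnable = apply_agent_executor_adapter(executor, state_reader=env.read_state)
result = runnable.invoke({"question": "Type the word 'cat'."})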
2 changes: 1 addition & 1 deletion langchain_benchmarks/tool_usage/evaluators.py
@@ -131,6 +131,6 @@ def evaluate_run(
)


def get_eval_config():
def get_eval_config() -> RunEvalConfig:
"""Returns the default evaluator for the environment."""
return RunEvalConfig(custom_evaluators=[AgentTrajectoryEvaluator()])
13 changes: 7 additions & 6 deletions langchain_benchmarks/tool_usage/tasks/relational_data.py
@@ -10,7 +10,8 @@
"""
from typing import Callable, List, TypedDict

from langchain.tools import BaseTool, tool
from langchain.tools import StructuredTool
from langchain_core.tools import ToolException

from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask

@@ -187,7 +188,7 @@ def _get_user(id: int) -> dict:
for user in USER_DATA:
if user["id"] == id:
return user
raise ValueError(f"User ID {id} cannot be resolved")
raise ToolException(f"User ID {id} cannot be resolved")


def _get_location(id: int) -> dict:
@@ -202,7 +203,7 @@ def _get_location(id: int) -> dict:
for location in LOCATION_DATA:
if location["id"] == id:
return location
raise ValueError(f"Location ID {id} cannot be resolved")
raise ToolException(f"Location ID {id} cannot be resolved")


def _get_food(food_id: int) -> dict:
@@ -217,7 +218,7 @@ def _get_food(food_id: int) -> dict:
for food in FOOD_DATA:
if food["id"] == food_id:
return food
raise ValueError(f"Food ID {food_id} cannot be resolved")
raise ToolException(f"Food ID {food_id} cannot be resolved")


def get_available_functions() -> List[Callable]:
@@ -391,10 +392,10 @@ def get_current_user_id() -> int:
return functions


def get_tools() -> List[BaseTool]:
def get_tools() -> List[StructuredTool]:
"""Get all the available tools."""
functions = get_available_functions()
return [tool(f) for f in functions]
return [StructuredTool.from_function(f, handle_tool_error=True) for f in functions]


def get_environment() -> ToolUsageEnvironment:
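Raising `ToolException` and constructing the tools with `handle_tool_error=True` means an unresolvable ID is returned to the agent as an error observation instead of aborting the run. A self-contained sketch of that mechanism (the `get_user_name` helper below is an illustrative stand-in, not the task's actual tool):

from langchain.tools import StructuredTool
from langchain_core.tools import ToolException


def get_user_name(user_id: int) -> str:
    """Illustrative helper: resolve a user name from an ID."""
    users = {1: "Alice", 2: "Bob"}
    if user_id not in users:
        raise ToolException(f"User ID {user_id} cannot be resolved")
    return users[user_id]


tool = StructuredTool.from_function(get_user_name, handle_tool_error=True)

print(tool.run({"user_id": 1}))   # -> "Alice"
print(tool.run({"user_id": 99}))  # -> "User ID 99 cannot be resolved" (returned, not raised)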
38 changes: 28 additions & 10 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 3 additions & 1 deletion tests/unit_tests/tool_usage/test_public_api.py
@@ -5,4 +5,6 @@ def test_public_api() -> None:
"""Test that the public API is correct."""
# This test will also fail if __all__ is not sorted.
# Please keep it sorted!
assert __all__ == sorted(["get_eval_config"], key=str.lower)
assert __all__ == sorted(
["apply_agent_executor_adapter", "get_eval_config"], key=str.lower
)