Merged
277 changes: 216 additions & 61 deletions docs/source/notebooks/tool_usage/intro.ipynb

Large diffs are not rendered by default.

479 changes: 182 additions & 297 deletions docs/source/notebooks/tool_usage/multiverse_math.ipynb

Large diffs are not rendered by default.

649 changes: 182 additions & 467 deletions docs/source/notebooks/tool_usage/relational_data.ipynb

Large diffs are not rendered by default.

18 changes: 7 additions & 11 deletions docs/source/notebooks/tool_usage/typewriter_1.ipynb
@@ -237,15 +237,11 @@
"id": "cd13d120-1bf9-481c-9392-c15ebdd9d77f",
"metadata": {},
"source": [
"## Agent"
]
},
{
"cell_type": "markdown",
"id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
"metadata": {},
"source": [
"Let's build an agent that we can use for evaluation."
"## Agent Factory\n",
"\n",
"For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n",
"\n",
"We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own."
]
},
{
@@ -340,7 +336,7 @@
"\n",
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory.create,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" tags=[\"gpt-3.5-turbo-16k\"],\n",
@@ -626,7 +622,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.11.4"
}
},
"nbformat": 4,
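Both typewriter notebooks now pass the factory object itself as `llm_or_chain_factory` instead of `agent_factory.create`; this works because `OpenAIAgentFactory` implements `__call__`, so a fresh agent executor is built for every evaluation run. A minimal sketch of the intended call pattern, assuming the task is fetched from the benchmark registry (the registry key below is illustrative, not taken from this diff):

from langsmith import Client

from langchain_benchmarks import registry
from langchain_benchmarks.tool_usage.agents import OpenAIAgentFactory
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

# Illustrative registry key -- check the registry for the exact task name.
task = registry["Tool Usage - Typewriter (1 tool)"]

# The factory builds a new agent executor each time it is called.
agent_factory = OpenAIAgentFactory(task, model="gpt-3.5-turbo-16k")

client = Client()
test_run = client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=agent_factory,  # callable factory, not agent_factory.create
    evaluation=get_eval_config(),
    verbose=True,
    tags=["gpt-3.5-turbo-16k"],
)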
10 changes: 7 additions & 3 deletions docs/source/notebooks/tool_usage/typewriter_26.ipynb
@@ -252,7 +252,11 @@
"id": "f1d62a13-3771-460f-b131-4443f669ca3d",
"metadata": {},
"source": [
"## Agent"
"## Agent Factory\n",
"\n",
"For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n",
"\n",
"We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own."
]
},
{
@@ -391,7 +395,7 @@
"\n",
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory.create,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=get_eval_config(),\n",
" verbose=True,\n",
" tags=[\"gpt-3.5-turbo-16k\"],\n",
@@ -757,7 +761,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.11.4"
}
},
"nbformat": 4,
6 changes: 5 additions & 1 deletion langchain_benchmarks/tool_usage/__init__.py
@@ -1,5 +1,9 @@
"""Package for helping to evaluate agent runs."""
from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

# Please keep this list sorted!
__all__ = ["get_eval_config"]
__all__ = [
"apply_agent_executor_adapter",
"get_eval_config",
]
102 changes: 78 additions & 24 deletions langchain_benchmarks/tool_usage/agents.py
@@ -1,12 +1,12 @@
"""Code for creating an agent factory for evaluating tool usage tasks."""
from typing import Any
from typing import Any, Callable, Optional

from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import Runnable, RunnablePassthrough
from langchain.schema.runnable import Runnable, RunnableLambda, RunnablePassthrough
from langchain.tools.render import format_tool_to_openai_function

from langchain_benchmarks.schema import ToolUsageTask
@@ -19,6 +19,9 @@ def _ensure_output_exists(inputs: dict) -> dict:
return inputs


# PUBLIC API


class OpenAIAgentFactory:
def __init__(
self, task: ToolUsageTask, *, model: str = "gpt-3.5-turbo-16k"
@@ -34,6 +37,10 @@ def __init__(

def create(self) -> Runnable:
"""Agent Executor"""
# For backwards compatibility
return self()

def __call__(self) -> Runnable:
llm = ChatOpenAI(
model=self.model,
temperature=0,
@@ -57,7 +64,7 @@ def create(self) -> Runnable:

runnable_agent = (
{
"input": lambda x: x["question"],
"input": lambda x: x["input"],
"agent_scratchpad": lambda x: format_to_openai_functions(
x["intermediate_steps"]
),
@@ -67,28 +74,75 @@
| OpenAIFunctionsAgentOutputParser()
)

def _read_state(*args: Any, **kwargs: Any) -> Any:
"""Read the state of the environment."""
if env.read_state is not None:
return env.read_state()
else:
return None

runnable = (
AgentExecutor(
agent=runnable_agent,
tools=env.tools,
handle_parsing_errors=True,
return_intermediate_steps=True,
)
| _ensure_output_exists
runnable = AgentExecutor(
agent=runnable_agent,
tools=env.tools,
handle_parsing_errors=True,
return_intermediate_steps=True,
)

if env.read_state is not None:
# If the environment has a state reader, add it to the runnable
runnable = runnable | RunnablePassthrough.assign(state=_read_state)
# Returns `state` in the output if the environment has a state reader
# and makes sure that `output` is always present in the result
return apply_agent_executor_adapter(runnable, state_reader=env.read_state)


# PUBLIC API


def apply_agent_executor_adapter(
agent_executor: AgentExecutor,
*,
state_reader: Optional[Callable[[], Any]] = None,
) -> Runnable:
"""An adapter for the agent executor to standardize its input and output.

1) Map `question` to `input` (`question` is used in the datasets,
but `input` is used in the agent executor)
2) Ensure that `output` is always returned (will be set to "" if missing) --
note that this may be relaxed after more updates in the eval config.
3) Populate `state` key in the response of the agent with the system state
if a state reader is provided.

Args:
agent_executor: the agent executor
state_reader: A callable that takes no parameters and, when invoked, returns
the state of the environment. Used to populate the 'state' key.

Returns:
a new runnable with a standardized output.
"""

def _read_state(*args: Any, **kwargs: Any) -> Any:
"""Read the state of the environment."""
if state_reader is not None:
return state_reader()
else:
return None

def _format_input(inputs: dict) -> dict:
"""Make sure that the input is always called `input`."""
if "question" not in inputs:
raise ValueError(
"Expected 'question' to be in the inputs. Found only the following "
f"keys {sorted(inputs.keys())}."
)

return runnable
inputs = inputs.copy() # Because 'question' is popped below

def __call__(self) -> Runnable:
return self.create()
if "input" not in inputs:
return {"input": inputs.pop("question"), **inputs}
return inputs

runnable = (
RunnableLambda(_format_input).with_config({"run_name": "Format Input"})
| agent_executor
| RunnableLambda(_ensure_output_exists).with_config(
{"run_name": "Ensure Output"}
)
)

if state_reader is not None:
runnable = runnable | RunnablePassthrough.assign(
state=_read_state
).with_config({"run_name": "Read Env State"})
return runnable
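The new `apply_agent_executor_adapter` is also useful on its own when you bring your own agent: it maps `question` to `input`, guarantees an `output` key, and, when a state reader is passed, adds a `state` key to the result. A minimal sketch, assuming `task` is a `ToolUsageTask` exposing `create_environment()` and `my_agent` is a hypothetical runnable agent (the OpenAI-functions wiring from the factory is omitted):

from langchain.agents import AgentExecutor

from langchain_benchmarks.tool_usage import apply_agent_executor_adapter

# `task` is an assumed ToolUsageTask; `my_agent` is a hypothetical runnable agent.
env = task.create_environment()
executor = AgentExecutor(
    agent=my_agent,
    tools=env.tools,
    handle_parsing_errors=True,
    return_intermediate_steps=True,
)

# The adapter renames `question` -> `input`, ensures `output` exists, and,
# because a state reader is supplied, attaches the environment state as `state`.
runnable = apply_agent_executor_adapter(executor, state_reader=env.read_state)
result = runnable.invoke({"question": "Type the word 'cat'."})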
2 changes: 1 addition & 1 deletion langchain_benchmarks/tool_usage/evaluators.py
@@ -131,6 +131,6 @@ def evaluate_run(
)


def get_eval_config():
def get_eval_config() -> RunEvalConfig:
"""Returns the default evaluator for the environment."""
return RunEvalConfig(custom_evaluators=[AgentTrajectoryEvaluator()])
13 changes: 7 additions & 6 deletions langchain_benchmarks/tool_usage/tasks/relational_data.py
@@ -10,7 +10,8 @@
"""
from typing import Callable, List, TypedDict

from langchain.tools import BaseTool, tool
from langchain.tools import StructuredTool
from langchain_core.tools import ToolException

from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask

@@ -187,7 +188,7 @@ def _get_user(id: int) -> dict:
for user in USER_DATA:
if user["id"] == id:
return user
raise ValueError(f"User ID {id} cannot be resolved")
raise ToolException(f"User ID {id} cannot be resolved")


def _get_location(id: int) -> dict:
@@ -202,7 +203,7 @@ def _get_location(id: int) -> dict:
for location in LOCATION_DATA:
if location["id"] == id:
return location
raise ValueError(f"Location ID {id} cannot be resolved")
raise ToolException(f"Location ID {id} cannot be resolved")


def _get_food(food_id: int) -> dict:
@@ -217,7 +218,7 @@ def _get_food(food_id: int) -> dict:
for food in FOOD_DATA:
if food["id"] == food_id:
return food
raise ValueError(f"Food ID {food_id} cannot be resolved")
raise ToolException(f"Food ID {food_id} cannot be resolved")


def get_available_functions() -> List[Callable]:
@@ -391,10 +392,10 @@ def get_current_user_id() -> int:
return functions


def get_tools() -> List[BaseTool]:
def get_tools() -> List[StructuredTool]:
"""Get all the available tools."""
functions = get_available_functions()
return [tool(f) for f in functions]
return [StructuredTool.from_function(f, handle_tool_error=True) for f in functions]


def get_environment() -> ToolUsageEnvironment:
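Raising `ToolException` and constructing the tools with `handle_tool_error=True` means an unresolvable ID is returned to the agent as an error observation instead of aborting the run. A self-contained sketch of that mechanism (the `get_user_name` helper below is an illustrative stand-in, not the task's actual tool):

from langchain.tools import StructuredTool
from langchain_core.tools import ToolException


def get_user_name(user_id: int) -> str:
    """Illustrative helper: resolve a user name from an ID."""
    users = {1: "Alice", 2: "Bob"}
    if user_id not in users:
        raise ToolException(f"User ID {user_id} cannot be resolved")
    return users[user_id]


tool = StructuredTool.from_function(get_user_name, handle_tool_error=True)

print(tool.run({"user_id": 1}))   # -> "Alice"
print(tool.run({"user_id": 99}))  # -> "User ID 99 cannot be resolved" (returned, not raised)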
38 changes: 28 additions & 10 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 3 additions & 1 deletion tests/unit_tests/tool_usage/test_public_api.py
@@ -5,4 +5,6 @@ def test_public_api() -> None:
"""Test that the public API is correct."""
# This test will also fail if __all__ is not sorted.
# Please keep it sorted!
assert __all__ == sorted(["get_eval_config"], key=str.lower)
assert __all__ == sorted(
["apply_agent_executor_adapter", "get_eval_config"], key=str.lower
)