22 changes: 22 additions & 0 deletions langchain_benchmarks/schema.py
@@ -10,6 +10,7 @@
from langchain.schema import BaseRetriever
from langchain.schema.document import Document
from langchain.schema.embeddings import Embeddings
from langchain.smith import RunEvalConfig
from langchain.tools import BaseTool
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from pydantic import BaseModel
@@ -93,6 +94,27 @@ class ToolUsageTask(BaseTask):
instructions: str
"""Instructions for the agent/chain/llm."""

eval_params: Dict[str, Any]
"""Used to parameterize differences in the evaluation of the task.

These are passed to the standard factory method for creating an evaluator
for tool usage.

For example, for the Multiverse Math task the `output_evaluation` parameter
is set to `qa_math`, which uses a different prompt for evaluating the
output of the agent.

This prompt performs better at comparing the output of the agent against
the reference output.
"""

def get_eval_config(self, **params: Any) -> RunEvalConfig:
"""Get the default evaluator for the environment."""
# Import locally to avoid potential circular imports in the future.
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

finalized_params = {**self.eval_params, **params}
return get_eval_config(**finalized_params)


@dataclasses.dataclass(frozen=True)
class ExtractionTask(BaseTask):
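For context (not part of the diff): a minimal sketch of how the new `get_eval_config` method might be used in a benchmark run, assuming the existing `langchain_benchmarks` registry and the `langchain.smith.run_on_dataset` workflow. The registry key, the agent factory, and the dataset wiring below are illustrative assumptions, not something this PR adds.

# Sketch only (not part of this PR).
from langsmith import Client
from langchain.smith import run_on_dataset
from langchain_benchmarks import registry

task = registry["Multiverse Math"]    # a ToolUsageTask; registry key assumed
eval_config = task.get_eval_config()  # merges task.eval_params with any caller overrides


def agent_factory():
    """Hypothetical factory returning the agent executor under test."""
    ...


run_on_dataset(
    client=Client(),
    dataset_name=task.name,              # assumes the LangSmith dataset shares the task name
    llm_or_chain_factory=agent_factory,  # placeholder; a real run needs a working agent
    evaluation=eval_config,              # the RunEvalConfig produced by the new method
)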
2 changes: 1 addition & 1 deletion langchain_benchmarks/tool_usage/evaluators.py
@@ -112,7 +112,7 @@ class AgentTrajectoryEvaluator(RunEvaluator):
def __init__(
self,
eval_llm: Union[BaseLanguageModel, BaseChatModel, None] = None,
output_evaluation: Literal["qa", "none"] = "qa",
output_evaluation: Literal["qa", "none", "qa_math"] = "qa",
) -> None:
"""Initialize the evaluator."""
if output_evaluation == "none":
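Also for context rather than part of the diff: the new `"qa_math"` option is selected through the `eval_params` plumbing added in `schema.py` above. A rough sketch of that flow, where the local variable names are illustrative and `get_eval_config` is assumed to forward its keyword arguments to `AgentTrajectoryEvaluator`:

# Sketch of the parameter flow (variable names are illustrative).
from langchain_benchmarks.tool_usage.evaluators import get_eval_config

task_defaults = {"output_evaluation": "qa_math"}  # ToolUsageTask.eval_params for Multiverse Math
overrides = {"output_evaluation": "qa"}           # caller-supplied **params take precedence
config = get_eval_config(**{**task_defaults, **overrides})  # mirrors ToolUsageTask.get_eval_config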
3 changes: 3 additions & 0 deletions langchain_benchmarks/tool_usage/tasks/multiverse_math.py
@@ -152,6 +152,9 @@ def get_environment() -> ToolUsageEnvironment:
solve simple math questions and ignore any innate knowledge about math.
"""
),
eval_params={
"output_evaluation": "qa_math",
},
)

# Source dataset used to create the public dataset in LangSmith
1 change: 1 addition & 0 deletions langchain_benchmarks/tool_usage/tasks/relational_data.py
@@ -438,4 +438,5 @@ def get_environment() -> ToolUsageEnvironment:
Success is measured by the ability to answer the question correctly, and efficiently.
"""
),
eval_params={}, # No special evaluation parameters
)
8 changes: 7 additions & 1 deletion langchain_benchmarks/tool_usage/tasks/type_writer.py
@@ -18,7 +18,7 @@ class Paper:
content: str


def create_typer(paper: Paper) -> Callable[[], str]:
def create_typer(paper: Paper) -> Callable[[str], str]:
"""Create a function that types the given letter."""

def type_letter(letter: str) -> str:
@@ -82,6 +82,12 @@ def _read_state() -> Any:
by the length of the string.
"""
),
eval_params={
# For this task, the agent's output is irrelevant;
# what we care about is the final state of the environment
# (i.e., what's written on the virtual paper).
"output_evaluation": "none",
},
)


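One more illustrative aside, not part of the diff: with `output_evaluation` set to `"none"`, grading for the typewriter task presumably rests on the environment state alone, i.e. the text accumulated on the virtual paper by the tool that `create_typer` builds. A small sketch of that interaction, assuming `Paper` is the simple dataclass defined earlier in `type_writer.py` and that `type_letter` appends each letter to `paper.content`:

# Sketch only; the append-to-content behavior is assumed from the surrounding code.
from langchain_benchmarks.tool_usage.tasks.type_writer import Paper, create_typer

paper = Paper(content="")          # the shared "virtual paper"
type_letter = create_typer(paper)  # Callable[[str], str], per the corrected annotation above
for letter in "cat":
    type_letter(letter)
print(paper.content)  # expected to print "cat"; this final state is what the evaluator checks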
6 changes: 6 additions & 0 deletions langchain_benchmarks/tool_usage/tasks/type_writer_26_funcs.py
@@ -92,6 +92,12 @@ def _read_state() -> Any:
given instead of a single tool that takes a letter as an argument.
"""
),
eval_params={
# For this task, the agent's output is irrelevant;
# what we care about is the final state of the environment
# (i.e., what's written on the virtual paper).
"output_evaluation": "none",
},
)

STRINGS_TO_TYPE = [