1,092 changes: 715 additions & 377 deletions docs/source/notebooks/tool_usage/multiverse_math.ipynb

Large diffs are not rendered by default.

385 changes: 309 additions & 76 deletions docs/source/notebooks/tool_usage/relational_data.ipynb

Large diffs are not rendered by default.

1,212 changes: 1,044 additions & 168 deletions docs/source/notebooks/tool_usage/typewriter_1.ipynb

Large diffs are not rendered by default.

528 changes: 66 additions & 462 deletions docs/source/notebooks/tool_usage/typewriter_26.ipynb

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions langchain_benchmarks/tool_usage/agents.py
@@ -121,6 +121,7 @@ def _read_state(*args: Any, **kwargs: Any) -> Any:

def _format_input(inputs: dict) -> dict:
"""Make sure that the input is always called `input`."""

if "question" not in inputs:
raise ValueError(
"Expected 'question' to be in the inputs. Found only the following "
@@ -142,7 +143,7 @@ def _format_input(inputs: dict) -> dict:
)

if state_reader is not None:
runnable = agent_executor | RunnablePassthrough.assign(
state=_read_state
).with_config({"run_name": "Read Env State"})
runnable = runnable | RunnablePassthrough.assign(state=_read_state).with_config(
{"run_name": "Read Env State"}
)
return runnable
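For readers skimming the diff, here is a minimal, self-contained sketch of the pattern this hunk switches to (the agent and state reader below are hypothetical stand-ins, not the benchmark's real ones): the agent executor is piped into a `RunnablePassthrough.assign` step that attaches the environment state under a `state` key and names the step for tracing.

```python
from langchain_core.runnables import RunnableLambda, RunnablePassthrough


def _read_state(*args, **kwargs):
    # Stand-in for reading the task environment's state after the agent runs.
    return {"paper": "typed text so far"}


# Fake "agent executor" that just echoes an output dict (assumption for the sketch).
agent_executor = RunnableLambda(
    lambda inputs: {"input": inputs["input"], "output": "done"}
)

runnable = agent_executor | RunnablePassthrough.assign(state=_read_state).with_config(
    {"run_name": "Read Env State"}
)

print(runnable.invoke({"input": "type the word cat"}))
# {'input': 'type the word cat', 'output': 'done', 'state': {'paper': 'typed text so far'}}
```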
78 changes: 65 additions & 13 deletions langchain_benchmarks/tool_usage/evaluators.py
Expand Up @@ -5,20 +5,23 @@
* Agents must output "intermediate_steps" in their run outputs.
* The dataset must have "expected_steps" in its outputs.
"""
from typing import Optional
from typing import Literal, Optional, Union

from langchain.callbacks.manager import collect_runs
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain.evaluation.schema import StringEvaluator
from langchain.smith import RunEvalConfig
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from langsmith.evaluation.evaluator import (
EvaluationResult,
EvaluationResults,
RunEvaluator,
)
from langsmith.schemas import Example, Run

from langchain_benchmarks.tool_usage.prompts import QA_TEMPLATE_FOR_MULTIVERSE_MATH


def compare_outputs(
run_outputs: dict,
@@ -97,16 +100,41 @@ def compare_outputs(
class AgentTrajectoryEvaluator(RunEvaluator):
"""An evaluator that can be used in conjunction with a standard agent interface."""

def __init__(self) -> None:
def __init__(
self,
eval_llm: Union[BaseLanguageModel, BaseChatModel, None] = None,
output_evaluation: Literal["qa", "none"] = "qa",
) -> None:
"""Initialize the evaluator."""
eval_llm = ChatOpenAI(
model="gpt-4",
temperature=0,
model_kwargs={"seed": 42},
max_retries=1,
request_timeout=60,
)
self.qa_evaluator = load_evaluator(EvaluatorType.QA, llm=eval_llm)
if output_evaluation == "none":
if eval_llm is not None:
raise ValueError(
"If output_evaluation is 'none', then eval_llm must be None"
)
qa_evaluator = None
else:
eval_llm = eval_llm or ChatOpenAI(
model="gpt-4",
temperature=0,
model_kwargs={"seed": 42},
max_retries=1,
request_timeout=60,
)
if output_evaluation == "qa":
qa_evaluator = load_evaluator(EvaluatorType.QA, llm=eval_llm)
elif output_evaluation == "qa_math":
qa_evaluator = load_evaluator(
EvaluatorType.QA,
llm=eval_llm,
prompt=QA_TEMPLATE_FOR_MULTIVERSE_MATH,
)
else:
raise ValueError(
f"output_evaluation must be one of 'qa' or 'none', "
f"got {output_evaluation}"
)

self.qa_evaluator = qa_evaluator

def evaluate_run(
self, run: Run, example: Optional[Example] = None
@@ -137,6 +165,30 @@ def evaluate_run(
)


def get_eval_config() -> RunEvalConfig:
"""Returns the default evaluator for the environment."""
return RunEvalConfig(custom_evaluators=[AgentTrajectoryEvaluator()])
def get_eval_config(
*,
eval_llm: Union[BaseLanguageModel, BaseChatModel, None] = None,
Reviewer comment (Collaborator): Do we need both chat and base language model?

output_evaluation: Literal["qa", "qa_math", "none"] = "qa",
) -> RunEvalConfig:
"""Get the default evaluator for the environment.

Args:
eval_llm: The language model to use for grading the `output` response.
output_evaluation: How to evaluate the output of the agent.
- 'qa' will use the qa evaluator to compare the output to the reference.
- 'qa_math' will use the qa evaluator to compare the output to the
reference, using a prompt that works better for multiverse math.
- 'none' will not evaluate the output of the agent; in some cases it is
only relevant to evaluate how the agent used tools, not what it output.

Returns:
A RunEvalConfig that can be used to evaluate the environment
"""
return RunEvalConfig(
custom_evaluators=[
AgentTrajectoryEvaluator(
eval_llm=eval_llm, output_evaluation=output_evaluation
)
]
)
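A hedged usage sketch of the new `get_eval_config` options (the import path follows this module's location; the model choice is only an example, not a requirement):

```python
from langchain.chat_models import ChatOpenAI

from langchain_benchmarks.tool_usage.evaluators import get_eval_config

# Grade final answers with the multiverse-math-aware QA prompt.
math_config = get_eval_config(
    eval_llm=ChatOpenAI(model="gpt-4", temperature=0),
    output_evaluation="qa_math",
)

# Or skip output grading entirely and only score how the agent used its tools.
trajectory_only_config = get_eval_config(output_evaluation="none")
```

Either config can then be passed wherever a `RunEvalConfig` is expected, e.g. as the evaluation config for a LangSmith `run_on_dataset` call.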
24 changes: 24 additions & 0 deletions langchain_benchmarks/tool_usage/prompts.py
@@ -0,0 +1,24 @@
from langchain_core.prompts import PromptTemplate

MATH_TEMPLATE = """\
You live in an alternate universe. Do not assume that you know anything.
You are a teacher grading a quiz.
You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: CORRECT or INCORRECT here

Given that you live in an alternate universe the TRUE answer may be different from what you expect. That's OK!

Grade the student answer based ONLY on whether it matches the TRUE answer. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin!

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:"""
QA_TEMPLATE_FOR_MULTIVERSE_MATH = PromptTemplate(
input_variables=["query", "result", "answer"], template=MATH_TEMPLATE
)
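To see what the grader will actually receive, the template can be formatted directly; the values below are placeholders rather than real dataset entries:

```python
from langchain_benchmarks.tool_usage.prompts import QA_TEMPLATE_FOR_MULTIVERSE_MATH

# Placeholder values only: this just shows which slots the QA evaluator fills in.
print(
    QA_TEMPLATE_FOR_MULTIVERSE_MATH.format(
        query="<the task question>",
        result="<the agent's final answer>",
        answer="<the reference answer from the dataset>",
    )
)
```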
6 changes: 3 additions & 3 deletions langchain_benchmarks/tool_usage/tasks/multiverse_math.py
@@ -190,8 +190,8 @@ def get_environment() -> ToolUsageEnvironment:
"expected_steps": ["log", "multiply"],
},
{
"question": "calculate sqrt of 101 to 4 digits of precision",
Reviewer comment (Collaborator): Are we fine keeping multiple sources of truth?

"answer": round(power(101, 0.4), 4),
"question": "calculate 101 to the power of 0.5 to 4 digits of precision",
"answer": round(power(101, 0.5), 4),
"expected_steps": ["power", "round"],
},
{
@@ -207,7 +207,7 @@ def get_environment() -> ToolUsageEnvironment:
"after calculating the sin of 1.5 radians, divide "
"the result by cos of 1.5 radians"
),
"answer": sin(1.5) / cos(1.5),
"answer": divide(sin(1.5), cos(1.5)),
"expected_steps": ["sin", "cos", "divide"],
},
{
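Computing the reference as `divide(sin(1.5), cos(1.5))` keeps the dataset answer tied to the task's own altered operations instead of a second, hand-computed source of truth, which is the concern raised in the reviewer comment above. The sketch below uses made-up redefinitions (not the task's real ones) purely to illustrate how a hard-coded standard-library value would drift from what the tools return:

```python
import math


# Hypothetical alternate-universe functions: NOT the benchmark's real definitions.
def sin(radians: float) -> float:
    return 1.1 * math.sin(radians)  # made-up scaling


def cos(radians: float) -> float:
    return 1.1 * math.cos(radians)  # made-up scaling


def divide(a: float, b: float) -> float:
    return 0.5 * a / b  # made-up scaling


# Reference computed with the task's own functions:
print(divide(sin(1.5), cos(1.5)))
# A hand-computed "normal universe" value disagrees under the altered functions:
print(math.sin(1.5) / math.cos(1.5))
```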
18 changes: 9 additions & 9 deletions poetry.lock

Some generated files are not rendered by default.