695 changes: 534 additions & 161 deletions docs/source/notebooks/tool_usage/multiverse_math.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions docs/source/notebooks/tool_usage/relational_data.ipynb
@@ -312,7 +312,7 @@
"source": [
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR"
"from langchain_benchmarks.tool_usage import get_eval_config"
]
},
{
@@ -361,10 +361,11 @@
}
],
"source": [
"eval_config = get_eval_config()\n",
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory.create,\n",
" evaluation=STANDARD_AGENT_EVALUATOR,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" tags=[\"openai-functions\"],\n",
")"
5 changes: 3 additions & 2 deletions docs/source/notebooks/tool_usage/typewriter_1.ipynb
@@ -333,14 +333,15 @@
"source": [
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"\n",
"client = Client()\n",
"eval_config = get_eval_config()\n",
"\n",
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory.create,\n",
" evaluation=STANDARD_AGENT_EVALUATOR,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" tags=[\"gpt-3.5-turbo-16k\"],\n",
")"
4 changes: 2 additions & 2 deletions docs/source/notebooks/tool_usage/typewriter_26.ipynb
@@ -385,14 +385,14 @@
"source": [
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"\n",
"client = Client()\n",
"\n",
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory.create,\n",
" evaluation=STANDARD_AGENT_EVALUATOR,\n",
" evaluation=get_eval_config(),\n",
" verbose=True,\n",
" tags=[\"gpt-3.5-turbo-16k\"],\n",
")"
4 changes: 2 additions & 2 deletions langchain_benchmarks/tool_usage/__init__.py
@@ -1,5 +1,5 @@
"""Package for helping to evaluate agent runs."""
-from langchain_benchmarks.tool_usage.evaluators import STANDARD_AGENT_EVALUATOR
+from langchain_benchmarks.tool_usage.evaluators import get_eval_config

# Please keep this list sorted!
__all__ = ["STANDARD_AGENT_EVALUATOR"]
__all__ = ["get_eval_config"]
72 changes: 47 additions & 25 deletions langchain_benchmarks/tool_usage/evaluators.py
@@ -7,7 +7,10 @@
"""
from typing import Optional

-from langchain.evaluation import EvaluatorType
+from langchain.callbacks.manager import collect_runs
+from langchain.chat_models import ChatOpenAI
+from langchain.evaluation import EvaluatorType, load_evaluator
+from langchain.evaluation.schema import StringEvaluator
from langchain.smith import RunEvalConfig
from langsmith.evaluation.evaluator import (
EvaluationResult,
@@ -17,7 +20,13 @@
from langsmith.schemas import Example, Run


-def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults:
+def compare_outputs(
+    run_outputs: dict,
+    example_outputs: dict,
+    run_inputs: dict,
+    *,
+    qa_evaluator: Optional[StringEvaluator] = None,
+) -> EvaluationResults:
"""Compare the outputs of a run to the expected outputs."""
intermediate_steps = run_outputs["intermediate_steps"]
# Since we are comparing to the tool names, we now need to get that
@@ -31,12 +40,12 @@ def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults:

if order_matters:
# If the order matters trajectory must be the same as expected trajectory
-        score = int(trajectory == expected_trajectory)
+        trajectory_score = int(trajectory == expected_trajectory)
else:
# If order does not matter, then we compare the trajectories after sorting
# them. This will make sure that the number of times each tool is used
# is the same, but the order does not matter.
-        score = int(sorted(trajectory) == sorted(expected_trajectory))
+        trajectory_score = int(sorted(trajectory) == sorted(expected_trajectory))

# Just score it based on whether it is correct or not
step_fraction = len(trajectory) / len(expected_trajectory)
@@ -45,7 +54,8 @@ def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults:
results = [
EvaluationResult(
key="Intermediate steps correctness",
-            score=score,
+            score=trajectory_score,
+            comment=f"Order matters={order_matters}",
),
EvaluationResult(
key="# steps / # expected steps",
@@ -65,12 +75,33 @@ def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults:
)
)

if "output" in run_outputs and qa_evaluator:
output = run_outputs["output"]
with collect_runs() as cb:
qa_results = qa_evaluator.evaluate_strings(
prediction=output,
reference=example_outputs["reference"],
input=run_inputs["question"],
)
results.append(
EvaluationResult(
key="correctness",
score=qa_results["score"],
source_run_id=cb.traced_runs[0].id,
)
)

return {"results": results}


class AgentTrajectoryEvaluator(RunEvaluator):
"""An evaluator that can be used in conjunction with a standard agent interface."""

+    def __init__(self) -> None:
+        """Initialize the evaluator."""
+        eval_llm = ChatOpenAI(model="gpt-4", temperature=0, model_kwargs={"seed": 42})
+        self.qa_evaluator = load_evaluator(EvaluatorType.QA, llm=eval_llm)

def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResults:
@@ -92,23 +123,14 @@ def evaluate_run(
"Please make sure that your dataset contains 'expected_steps'"
)

-        return compare_outputs(run.outputs, example.outputs)


-STANDARD_AGENT_EVALUATOR = RunEvalConfig(
-    # Evaluators can either be an evaluator type
-    # (e.g., "qa", "criteria", "embedding_distance", etc.) or a
-    # configuration for that evaluator
-    evaluators=[
-        # Measures whether a QA response is "Correct", based on a reference answer
-        # You can also select via the raw string "qa"
-        EvaluatorType.QA
-    ],
-    # You can add custom StringEvaluator or RunEvaluator objects
-    # here as well, which will automatically be
-    # applied to each prediction. Check out the docs for examples.
-    custom_evaluators=[AgentTrajectoryEvaluator()],
-    # We now need to specify this because we have multiple outputs in our dataset
-    reference_key="reference",
-    prediction_key="output",
-)
+        return compare_outputs(
+            run.outputs,
+            example.outputs,
+            qa_evaluator=self.qa_evaluator,
+            run_inputs=run.inputs,
+        )


+def get_eval_config():
+    """Returns the default evaluator for the environment."""
+    return RunEvalConfig(custom_evaluators=[AgentTrajectoryEvaluator()])
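To make the new `correctness` metric concrete, here is a minimal standalone sketch of the QA check that `AgentTrajectoryEvaluator` now applies to the agent's final answer. It only uses calls visible in the diff above (`load_evaluator`, `evaluate_strings`, `collect_runs`); the three strings are made-up sample data standing in for the run's `output`, the dataset's `reference`, and the input `question`, and an `OPENAI_API_KEY` is assumed to be set:

```python
from langchain.callbacks.manager import collect_runs
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator

# The same judge that AgentTrajectoryEvaluator.__init__ builds: GPT-4, temperature 0, pinned seed.
eval_llm = ChatOpenAI(model="gpt-4", temperature=0, model_kwargs={"seed": 42})
qa_evaluator = load_evaluator(EvaluatorType.QA, llm=eval_llm)

# collect_runs captures the grader's own run so its id can be attached
# to the EvaluationResult as source_run_id, as compare_outputs does above.
with collect_runs() as cb:
    qa_results = qa_evaluator.evaluate_strings(
        prediction="The answer is 42.",    # stands in for run_outputs["output"]
        reference="42",                    # stands in for example_outputs["reference"]
        input="What is six times seven?",  # stands in for run_inputs["question"]
    )

print(qa_results["score"])   # 1 if the judge marks the prediction correct, else 0
print(cb.traced_runs[0].id)  # grader run id that would become source_run_id
```

The trajectory metrics are unchanged apart from the renamed `trajectory_score` and the new `Order matters=` comment; the QA step only runs when the agent produced an `output` and a `qa_evaluator` was supplied.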
2 changes: 1 addition & 1 deletion tests/unit_tests/tool_usage/test_evaluator.py
@@ -109,7 +109,7 @@
)
def test_compare_outputs(run_outputs, example_outputs, expected_results):
"""Test compare outputs."""
-    evaluation_results = compare_outputs(run_outputs, example_outputs)
+    evaluation_results = compare_outputs(run_outputs, example_outputs, run_inputs={})
assert {
result.key: result.score for result in evaluation_results["results"]
} == expected_results
2 changes: 1 addition & 1 deletion tests/unit_tests/tool_usage/test_public_api.py
@@ -5,4 +5,4 @@ def test_public_api() -> None:
"""Test that the public API is correct."""
# This test will also fail if __all__ is not sorted.
# Please keep it sorted!
assert __all__ == sorted(["STANDARD_AGENT_EVALUATOR"], key=str.lower)
assert __all__ == sorted(["get_eval_config"], key=str.lower)