Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions langchain_benchmarks/extraction/email_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ def get_eval_config(eval_llm: BaseModel) -> RunEvalConfig:
)


EmailTask = ExtractionTask(
id=4, # To be deprecated
EMAIL_EXTRACTION_TASK = ExtractionTask(
name="Email Extraction",
dataset_id="https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d",
model=Email,
Expand Down
190 changes: 9 additions & 181 deletions langchain_benchmarks/registration.py
Original file line number Diff line number Diff line change
@@ -1,193 +1,21 @@
"""Registry of environments for ease of access."""
import dataclasses
from typing import Sequence, Union

from tabulate import tabulate

from langchain_benchmarks.schema import ToolUsageTask, ExtractionTask
from langchain_benchmarks.extraction import email_task
from langchain_benchmarks.tool_usage.environments import (
relational_data,
type_writer,
from langchain_benchmarks.schema import Registry
from langchain_benchmarks.tool_usage import (
type_writer_26_funcs,
type_writer,
relational_data,
multiverse_math,
)


@dataclasses.dataclass(frozen=True)
class Registry:
    """An immutable collection of tasks addressable by name or by ID.

    Task names and task IDs must both be unique; this is validated when the
    registry is created.
    """

    # The registered tasks.
    tasks: "Sequence[ToolUsageTask]"

    def get_task(self, name_or_id: Union[int, str]) -> "ToolUsageTask":
        """Get the task whose name or ID equals ``name_or_id``.

        Args:
            name_or_id: The task name or the task ID to look up.

        Returns:
            The first task whose ``name`` or ``id`` matches.

        Raises:
            ValueError: If no task matches.
        """
        for task in self.tasks:
            if task.name == name_or_id or task.id == name_or_id:
                return task
        raise ValueError(f"Unknown task {name_or_id}")

    def __post_init__(self) -> None:
        """Validate that all the tasks have unique names and IDs.

        Raises:
            ValueError: If two tasks share a name or share an ID.
        """
        seen_names = set()
        seen_ids = set()
        for task in self.tasks:
            if task.name in seen_names:
                raise ValueError(
                    f"Duplicate task name {task.name}. " f"Task names must be unique."
                )
            seen_names.add(task.name)
            if task.id in seen_ids:
                raise ValueError(
                    f"Duplicate task ID {task.id}. " f"Task IDs must be unique."
                )
            # Record the ID; without this the duplicate-ID check can never fire.
            seen_ids.add(task.id)

    def _repr_html_(self) -> str:
        """Return an HTML table of the registry (one row per task)."""
        headers = [
            "ID",
            "Name",
            "Dataset ID",
            "Description",
        ]
        table = [
            [
                task.id,
                task.name,
                task.dataset_id,
                task.description,
            ]
            for task in self.tasks
        ]
        return tabulate(table, headers=headers, tablefmt="html")

    def __getitem__(self, key: Union[int, str]) -> "ToolUsageTask":
        """Get a task from the registry by ID (int) or by name (str).

        Raises:
            NotImplementedError: If ``key`` is a slice.
            TypeError: If ``key`` is neither an int nor a str.
            ValueError: If no task matches ``key``.
        """
        if isinstance(key, slice):
            raise NotImplementedError("Slicing is not supported.")
        if isinstance(key, (int, str)):
            return self.get_task(key)
        # Slices are rejected above, so the accepted key types are int and str.
        raise TypeError("Key must be an integer or a string.")


# The lower-case name is deliberate: it gives a slightly nicer API when the
# registry is used from a notebook.
#
# NOTE(review): this list holds inline ToolUsageTask definitions AND references
# to task constants defined in the task modules. Some of them appear to describe
# the same task (e.g. "Multiverse Math" is defined inline below and is also the
# name given to multiverse_math.MULTIVERSE_MATH). This looks like both sides of
# a diff; confirm which set of entries is meant to remain.
registry = Registry(
tasks=[
# Inline definition: relational-data tool-usage task.
ToolUsageTask(
id=0,
name="Tool Usage - Relational Data",
dataset_id=relational_data.DATASET_ID,
create_environment=relational_data.get_environment,
instructions=(
"""\
Please answer the user's question by using the tools provided. Do not guess the \
answer. Keep in mind that entities like users,foods and locations have both a \
name and an ID, which are not the same."""
),
description=(
"""\
Environment with fake data about users and their locations and favorite foods.

The environment provides a set of tools that can be used to query the data.

The objective of this task is to evaluate the ability to use the provided tools \
to answer questions about relational data.

The dataset contains 21 examples of varying difficulty. The difficulty is measured \
by the number of tools that need to be used to answer the question.

Each example is composed of a question, a reference answer, and \
information about the sequence in which tools should be used to answer \
the question.

Success is measured by the ability to answer the question correctly, and efficiently.
"""
),
),
# Inline definition: typewriter task with a single one-letter function.
ToolUsageTask(
id=1,
name="Tool Usage - Typewriter (1 func)",
dataset_id="placeholder",
create_environment=type_writer.get_environment,
instructions=(
"Repeat the given string by using the provided tools. "
"Do not write anything else or provide any explanations. "
"For example, if the string is 'abc', you must invoke the tools "
"'a', 'b', and 'c' in that order. "
"Please invoke the function with a single letter at a time."
),
description=(
"""\
Environment with a single function that accepts a single letter as input, and \
"prints" it on a piece of paper.

The objective of this task is to evaluate the ability to use the provided \
tools to repeat a given input string.

For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.

The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
),
),
# Inline definition: typewriter task with 26 parameterless functions.
ToolUsageTask(
id=2,
name="Tool Usage - Typewriter",
dataset_id="placeholder",
create_environment=type_writer_26_funcs.get_environment,
instructions=(
"Repeat the given string by using the provided tools. "
"Do not write anything else or provide any explanations. "
"For example, if the string is 'abc', you must invoke the tools "
"'a', 'b', and 'c' in that order. "
"Please invoke the functions without any arguments."
),
description=(
"""\
Environment with 26 functions each representing a letter of the alphabet.

In this variation of the typewriter task, there are 26 parameterless functions, where \
each function represents a letter of the alphabet (instead of a single function that \
takes a letter as an argument).

The object is to evaluate the ability of use the functions to repeat the given string.

For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
in that order.

The dataset includes examples of varying difficulty. The difficulty is measured \
by the length of the string.
"""
),
),
# Inline definition: math task whose operations give altered results.
ToolUsageTask(
id=3,
name="Multiverse Math",
dataset_id="placeholder",
create_environment=multiverse_math.get_environment,
instructions=(
"You are requested to solve math questions in an alternate "
"mathematical universe. The rules of association, commutativity, "
"and distributivity still apply, but the operations have been "
"altered to yield different results than expected. Solve the "
"given math questions using the provided tools. "
"Do not guess the answer."
),
description=(
"""\
An environment that contains a few basic math operations, but with altered results.

For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.

The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
"""
),
),
# Task constants referenced from the task modules.
email_task.EmailTask,
type_writer.TYPE_WRITER_TASK,
type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK,
relational_data.RELATIONAL_DATA_TASK,
multiverse_math.MULTIVERSE_MATH,
email_task.EMAIL_EXTRACTION_TASK,
]
)
60 changes: 57 additions & 3 deletions langchain_benchmarks/schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Schema for the Langchain Benchmarks."""
import dataclasses
from typing import List, Callable, Any, Optional, Type
from typing import List, Callable, Any, Optional, Type, Union

from langchain.tools import BaseTool
from pydantic import BaseModel
Expand All @@ -22,8 +22,6 @@ class ToolUsageEnvironment:
class BaseTask:
"""A definition of a task."""

id: int
"""The ID of the environment."""
name: str
"""The name of the environment."""

Expand Down Expand Up @@ -73,3 +71,59 @@ class ExtractionTask(BaseTask):

model: Type[BaseModel]
"""Get the model for the task."""


@dataclasses.dataclass(frozen=False)
class Registry:
    """A mutable collection of tasks addressable by name or by position.

    Task names must be unique among the tasks passed at construction time;
    this is validated in ``__post_init__``.
    """

    # The registered tasks, in registration order.
    tasks: "List[BaseTask]"

    def get_task(self, name_or_id: Union[int, str]) -> "BaseTask":
        """Get a task by its name or by its position in the registry.

        Args:
            name_or_id: An integer is used as an index into ``tasks``;
                any other value is compared against each task's ``name``.

        Returns:
            The matching task.

        Raises:
            IndexError: If an integer index is out of range.
            ValueError: If no task has the given name.
        """
        if isinstance(name_or_id, int):
            return self.tasks[name_or_id]

        for task in self.tasks:
            if task.name == name_or_id:
                return task
        raise ValueError(f"Unknown task {name_or_id}")

    def __post_init__(self) -> None:
        """Validate that all the tasks have unique names.

        Raises:
            ValueError: If two tasks share the same name.
        """
        seen_names = set()
        for task in self.tasks:
            if task.name in seen_names:
                raise ValueError(
                    f"Duplicate task name {task.name}. " f"Task names must be unique."
                )
            # Record the name; without this the duplicate check can never fire.
            seen_names.add(task.name)

    def _repr_html_(self) -> str:
        """Return an HTML table of the registry (one row per task)."""
        # Imported here so the block does not rely on a module-level import
        # of tabulate being present in this file.
        from tabulate import tabulate

        headers = [
            "Name",
            "Dataset ID",
            "Description",
        ]
        table = [
            [
                task.name,
                task.dataset_id,
                task.description,
            ]
            for task in self.tasks
        ]
        return tabulate(table, headers=headers, tablefmt="html")

    def __getitem__(self, key: Union[int, str]) -> "BaseTask":
        """Get a task from the registry by position (int) or by name (str).

        Raises:
            NotImplementedError: If ``key`` is a slice.
            TypeError: If ``key`` is neither an int nor a str.
            IndexError: If an integer ``key`` is out of range.
            ValueError: If no task is named ``key``.
        """
        if isinstance(key, slice):
            raise NotImplementedError("Slicing is not supported.")
        if isinstance(key, (int, str)):
            return self.get_task(key)
        # Slices are rejected above, so the accepted key types are int and str.
        raise TypeError("Key must be an integer or a string.")

    def add(self, task: "BaseTask") -> None:
        """Append a task to the registry.

        Raises:
            TypeError: If ``task`` is not a ``BaseTask`` instance.
        """
        if not isinstance(task, BaseTask):
            raise TypeError("Only tasks can be added to the registry.")
        self.tasks.append(task)
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from langchain.tools import tool, BaseTool

from langchain_benchmarks.schema import ToolUsageEnvironment
from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask


def multiply(a: float, b: float) -> float:
Expand Down Expand Up @@ -86,3 +86,30 @@ def get_environment() -> ToolUsageEnvironment:
tools=tools,
read_state=None,
)


# Task definition for the "Multiverse Math" environment built by
# get_environment: math questions answered with tools whose results are altered.
MULTIVERSE_MATH = ToolUsageTask(
    name="Multiverse Math",
    dataset_id="placeholder",
    create_environment=get_environment,
    instructions=(
        "You are requested to solve math questions in an alternate mathematical "
        "universe. The rules of association, commutativity, and distributivity "
        "still apply, but the operations have been altered to yield different "
        "results than expected. Solve the given math questions using the "
        "provided tools. Do not guess the answer."
    ),
    # Backslash continuations join the wrapped lines of each paragraph.
    description="""\
An environment that contains a few basic math operations, but with altered results.

For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.

The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
""",
)
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from langchain.tools import BaseTool, tool

from langchain_benchmarks.schema import ToolUsageEnvironment
from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask

USER_DATA = [
# IDs are not consecutive to prevent agents from guessing the ID
Expand Down Expand Up @@ -407,3 +407,34 @@ def get_environment() -> ToolUsageEnvironment:

# ID of a dataset that contains the questions and references
DATASET_ID = "e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5" # ID of Agent Gym: E01 Alpha

# Task definition for the relational-data environment built by get_environment;
# the questions and references live in the dataset identified by DATASET_ID.
RELATIONAL_DATA_TASK = ToolUsageTask(
    name="Tool Usage - Relational Data",
    dataset_id=DATASET_ID,
    create_environment=get_environment,
    instructions=(
        "Please answer the user's question by using the tools provided. "
        "Do not guess the answer. Keep in mind that entities like users,foods "
        "and locations have both a name and an ID, which are not the same."
    ),
    # Backslash continuations join the wrapped lines of each paragraph.
    description="""\
Environment with fake data about users and their locations and favorite foods.

The environment provides a set of tools that can be used to query the data.

The objective of this task is to evaluate the ability to use the provided tools \
to answer questions about relational data.

The dataset contains 21 examples of varying difficulty. The difficulty is measured \
by the number of tools that need to be used to answer the question.

Each example is composed of a question, a reference answer, and \
information about the sequence in which tools should be used to answer \
the question.

Success is measured by the ability to answer the question correctly, and efficiently.
""",
)
Loading