langchain-ai · eyurtsev · Nov 20, 2023 · Nov 20, 2023
diff --git a/langchain_benchmarks/extraction/email_task.py b/langchain_benchmarks/extraction/email_task.py
@@ -58,8 +58,7 @@ def get_eval_config(eval_llm: BaseModel) -> RunEvalConfig:
     )
 
 
-EmailTask = ExtractionTask(
-    id=4,  # To be deprecated
+EMAIL_EXTRACTION_TASK = ExtractionTask(
     name="Email Extraction",
     dataset_id="https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d",
     model=Email,

diff --git a/langchain_benchmarks/registration.py b/langchain_benchmarks/registration.py
@@ -1,193 +1,21 @@
 """Registry of environments for ease of access."""
-import dataclasses
-from typing import Sequence, Union
 
-from tabulate import tabulate
-
-from langchain_benchmarks.schema import ToolUsageTask, ExtractionTask
 from langchain_benchmarks.extraction import email_task
-from langchain_benchmarks.tool_usage.environments import (
-    relational_data,
-    type_writer,
+from langchain_benchmarks.schema import Registry
+from langchain_benchmarks.tool_usage import (
     type_writer_26_funcs,
+    type_writer,
+    relational_data,
     multiverse_math,
 )
 
-
-@dataclasses.dataclass(frozen=True)
-class Registry:
-    tasks: Sequence[ToolUsageTask]
-
-    def get_task(self, name_or_id: Union[int, str]) -> ToolUsageTask:
-        """Get the environment with the given name."""
-        for env in self.tasks:
-            if env.name == name_or_id or env.id == name_or_id:
-                return env
-        raise ValueError(f"Unknown task {name_or_id}")
-
-    def __post_init__(self) -> None:
-        """Validate that all the tasks have unique names and IDs."""
-        seen_names = set()
-        seen_ids = set()
-        for task in self.tasks:
-            if task.name in seen_names:
-                raise ValueError(
-                    f"Duplicate task name {task.name}. " f"Task names must be unique."
-                )
-            seen_names.add(task.name)
-            if task.id in seen_ids:
-                raise ValueError(
-                    f"Duplicate task ID {task.id}. " f"Task IDs must be unique."
-                )
-
-    def _repr_html_(self) -> str:
-        """Return a HTML representation of the registry."""
-        headers = [
-            "ID",
-            "Name",
-            "Dataset ID",
-            "Description",
-        ]
-        table = [
-            [
-                env.id,
-                env.name,
-                env.dataset_id,
-                env.description,
-            ]
-            for env in self.tasks
-        ]
-        return tabulate(table, headers=headers, tablefmt="html")
-
-    def __getitem__(self, key: Union[int, str]) -> ToolUsageTask:
-        """Get an environment from the registry."""
-        if isinstance(key, slice):
-            raise NotImplementedError("Slicing is not supported.")
-        elif isinstance(key, (int, str)):
-            # If key is an integer, return the corresponding environment
-            return self.get_task(key)
-        else:
-            raise TypeError("Key must be an integer or a slice.")
-
-
 # Using lower case naming to make a bit prettier API when used in a notebook
 registry = Registry(
     tasks=[
-        ToolUsageTask(
-            id=0,
-            name="Tool Usage - Relational Data",
-            dataset_id=relational_data.DATASET_ID,
-            create_environment=relational_data.get_environment,
-            instructions=(
-                """\
-Please answer the user's question by using the tools provided. Do not guess the \
-answer. Keep in mind that entities like users,foods and locations have both a \
-name and an ID, which are not the same."""
-            ),
-            description=(
-                """\
-Environment with fake data about users and their locations and favorite foods.
-
-The environment provides a set of tools that can be used to query the data.
-
-The objective of this task is to evaluate the ability to use the provided tools \
-to answer questions about relational data.
-
-The dataset contains 21 examples of varying difficulty. The difficulty is measured \
-by the number of tools that need to be used to answer the question.
-
-Each example is composed of a question, a reference answer, and \
-information about the sequence in which tools should be used to answer \
-the question.
-
-Success is measured by the ability to answer the question correctly, and efficiently.
-"""
-            ),
-        ),
-        ToolUsageTask(
-            id=1,
-            name="Tool Usage - Typewriter (1 func)",
-            dataset_id="placeholder",
-            create_environment=type_writer.get_environment,
-            instructions=(
-                "Repeat the given string by using the provided tools. "
-                "Do not write anything else or provide any explanations. "
-                "For example, if the string is 'abc', you must invoke the tools "
-                "'a', 'b', and 'c' in that order. "
-                "Please invoke the function with a single letter at a time."
-            ),
-            description=(
-                """\
-Environment with a single function that accepts a single letter as input, and \
-"prints" it on a piece of paper.
-
-The objective of this task is to evaluate the ability to use the provided \
- tools to repeat a given input string.
-
-For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
-in that order.
-
-The dataset includes examples of varying difficulty. The difficulty is measured \
-by the length of the string.
-"""
-            ),
-        ),
-        ToolUsageTask(
-            id=2,
-            name="Tool Usage - Typewriter",
-            dataset_id="placeholder",
-            create_environment=type_writer_26_funcs.get_environment,
-            instructions=(
-                "Repeat the given string by using the provided tools. "
-                "Do not write anything else or provide any explanations. "
-                "For example, if the string is 'abc', you must invoke the tools "
-                "'a', 'b', and 'c' in that order. "
-                "Please invoke the functions without any arguments."
-            ),
-            description=(
-                """\
-Environment with 26 functions each representing a letter of the alphabet.
-
-In this variation of the typewriter task, there are 26 parameterless functions, where \
-each function represents a letter of the alphabet (instead of a single function that \
-takes a letter as an argument).
-
-The object is to evaluate the ability of use the functions to repeat the given string.
-
-For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
-in that order.
-
-The dataset includes examples of varying difficulty. The difficulty is measured \
-by the length of the string.
-"""
-            ),
-        ),
-        ToolUsageTask(
-            id=3,
-            name="Multiverse Math",
-            dataset_id="placeholder",
-            create_environment=multiverse_math.get_environment,
-            instructions=(
-                "You are requested to solve math questions in an alternate "
-                "mathematical universe. The rules of association, commutativity, "
-                "and distributivity still apply, but the operations have been "
-                "altered to yield different results than expected. Solve the "
-                "given math questions using the provided tools. "
-                "Do not guess the answer."
-            ),
-            description=(
-                """\
-An environment that contains a few basic math operations, but with altered results.
-
-For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
-The basic operations retain some basic properties, such as commutativity, \
-associativity, and distributivity; however, the results are different than expected.
-
-The objective of this task is to evaluate the ability to use the provided tools to \
-solve simple math questions and ignore any innate knowledge about math.
-"""
-            ),
-        ),
-        email_task.EmailTask,
+        type_writer.TYPE_WRITER_TASK,  # registry[0]: int keys index this list
+        type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK,  # registry[1]
+        relational_data.RELATIONAL_DATA_TASK,  # registry[2]
+        multiverse_math.MULTIVERSE_MATH,  # registry[3]
+        email_task.EMAIL_EXTRACTION_TASK,  # registry[4]
     ]
 )
diff --git a/langchain_benchmarks/schema.py b/langchain_benchmarks/schema.py
@@ -1,6 +1,6 @@
 """Schema for the Langchain Benchmarks."""
 import dataclasses
-from typing import List, Callable, Any, Optional, Type
+from typing import List, Callable, Any, Optional, Type, Union
 
 from langchain.tools import BaseTool
 from pydantic import BaseModel
@@ -22,8 +22,6 @@ class ToolUsageEnvironment:
 class BaseTask:
     """A definition of a task."""
 
-    id: int
-    """The ID of the environment."""
     name: str
     """The name of the environment."""
 
@@ -73,3 +71,59 @@ class ExtractionTask(BaseTask):
 
     model: Type[BaseModel]
     """Get the model for the task."""
+
+
+@dataclasses.dataclass(frozen=False)
+class Registry:
+    """A mutable collection of benchmark tasks, looked up by position or name."""
+
+    tasks: List[BaseTask]
+
+    def get_task(self, name_or_id: Union[int, str]) -> BaseTask:
+        """Return the task at index `name_or_id` (int) or named `name_or_id` (str).
+
+        Raises IndexError for an out-of-range index, ValueError for an unknown name.
+        """
+        if isinstance(name_or_id, int):
+            return self.tasks[name_or_id]
+        for task in self.tasks:
+            if task.name == name_or_id:
+                return task
+        raise ValueError(f"Unknown task {name_or_id}")
+
+    def __post_init__(self) -> None:
+        """Validate that all the tasks have unique names."""
+        seen_names = set()
+        for task in self.tasks:
+            if task.name in seen_names:
+                raise ValueError(
+                    f"Duplicate task name {task.name}. " f"Task names must be unique."
+                )
+            # Record the name; without this the duplicate check could never fire.
+            seen_names.add(task.name)
+
+    def _repr_html_(self) -> str:
+        """Return an HTML table with one row per task, for notebook display."""
+        # Local import: a module-level `tabulate` import is not visible in this patch.
+        from tabulate import tabulate
+
+        headers = ["Name", "Dataset ID", "Description"]
+        table = [[t.name, t.dataset_id, t.description] for t in self.tasks]
+        return tabulate(table, headers=headers, tablefmt="html")
+
+    def __getitem__(self, key: Union[int, str]) -> BaseTask:
+        """Look up a task by index or name; slices and other key types are rejected."""
+        if isinstance(key, slice):
+            raise NotImplementedError("Slicing is not supported.")
+        if isinstance(key, (int, str)):
+            return self.get_task(key)
+        raise TypeError("Key must be an integer or a string.")
+
+    def add(self, task: BaseTask) -> None:
+        """Add `task`; TypeError if not a BaseTask, ValueError on a duplicate name."""
+        if not isinstance(task, BaseTask):
+            raise TypeError("Only tasks can be added to the registry.")
+        if any(task.name == existing.name for existing in self.tasks):
+            msg = f"Duplicate task name {task.name}. Task names must be unique."
+            raise ValueError(msg)
+        self.tasks.append(task)
diff --git a/langchain_benchmarks/tool_usage/environments/__init__.py b/langchain_benchmarks/tool_usage/environments/__init__.py
diff --git a/...ool_usage/environments/multiverse_math.py → ..._benchmarks/tool_usage/multiverse_math.py b/...ool_usage/environments/multiverse_math.py → ..._benchmarks/tool_usage/multiverse_math.py
@@ -14,7 +14,7 @@
 
 from langchain.tools import tool, BaseTool
 
-from langchain_benchmarks.schema import ToolUsageEnvironment
+from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask
 
 
 def multiply(a: float, b: float) -> float:
@@ -86,3 +86,30 @@ def get_environment() -> ToolUsageEnvironment:
         tools=tools,
         read_state=None,
     )
+
+
+MULTIVERSE_MATH = ToolUsageTask(  # task built on this module's get_environment()
+    name="Multiverse Math",
+    dataset_id="placeholder",  # TODO(review): literal placeholder, not a real dataset ID
+    create_environment=get_environment,
+    instructions=(  # adjacent string literals concatenate into a single string
+        "You are requested to solve math questions in an alternate "
+        "mathematical universe. The rules of association, commutativity, "
+        "and distributivity still apply, but the operations have been "
+        "altered to yield different results than expected. Solve the "
+        "given math questions using the provided tools. "
+        "Do not guess the answer."
+    ),
+    description=(  # a trailing "\" inside the literal joins lines without a newline
+        """\
+An environment that contains a few basic math operations, but with altered results.
+
+For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
+The basic operations retain some basic properties, such as commutativity, \
+associativity, and distributivity; however, the results are different than expected.
+
+The objective of this task is to evaluate the ability to use the provided tools to \
+solve simple math questions and ignore any innate knowledge about math.
+"""
+    ),
+)
diff --git a/...ool_usage/environments/relational_data.py → ..._benchmarks/tool_usage/relational_data.py b/...ool_usage/environments/relational_data.py → ..._benchmarks/tool_usage/relational_data.py
@@ -12,7 +12,7 @@
 
 from langchain.tools import BaseTool, tool
 
-from langchain_benchmarks.schema import ToolUsageEnvironment
+from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask
 
 USER_DATA = [
     # IDs are not consecutive to prevent agents from guessing the ID
@@ -407,3 +407,34 @@ def get_environment() -> ToolUsageEnvironment:
 
 # ID of a dataset that contains the questions and references
 DATASET_ID = "e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5"  # ID of Agent Gym: E01 Alpha
+
+RELATIONAL_DATA_TASK = ToolUsageTask(  # task built on this module's get_environment()
+    name="Tool Usage - Relational Data",
+    dataset_id=DATASET_ID,  # module-level constant defined just above
+    create_environment=get_environment,
+    instructions=(  # a trailing "\" inside the literal joins lines without a newline
+        """\
+Please answer the user's question by using the tools provided. Do not guess the \
+answer. Keep in mind that entities like users,foods and locations have both a \
+name and an ID, which are not the same."""
+    ),
+    description=(  # same "\" continuation style; this one ends with a newline
+        """\
+Environment with fake data about users and their locations and favorite foods.
+
+The environment provides a set of tools that can be used to query the data.
+
+The objective of this task is to evaluate the ability to use the provided tools \
+to answer questions about relational data.
+
+The dataset contains 21 examples of varying difficulty. The difficulty is measured \
+by the number of tools that need to be used to answer the question.
+
+Each example is composed of a question, a reference answer, and \
+information about the sequence in which tools should be used to answer \
+the question.
+
+Success is measured by the ability to answer the question correctly, and efficiently.
+"""
+    ),
+)