98 changes: 98 additions & 0 deletions docs/source/notebooks/tool_usage/benchmark_all_tasks.ipynb
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1ba9f105-c48f-4d8c-8253-355ef13156b0",
"metadata": {},
"source": [
"# Benchmark All\n",
"\n",
"Here, we'll run benchmarking against all tool usage task.\n",
"\n",
"Expand the models list to benchmark against different models."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddadb9ef-e76a-4b48-85e4-f62c3957f502",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import datetime\n",
"import uuid\n",
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks import clone_public_dataset, registry\n",
"from langchain_benchmarks.tool_usage import agents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6fbc3ef-7a3f-430f-8b79-45af5861b3ee",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"experiment_uuid = uuid.uuid4().hex[:4]\n",
"models = [\"gpt-3.5-turbo-16k\"]\n",
"client = Client() # Launch langsmith client for cloning datasets\n",
"today = datetime.date.today().isoformat()\n",
"\n",
"for task in registry:\n",
" dataset_name = task.name + f\"_benchmarking_{today}\"\n",
" clone_public_dataset(task.dataset_id, dataset_name=dataset_name)\n",
"\n",
" if task.type != \"ToolUsageTask\":\n",
" continue\n",
" for model in models:\n",
" print()\n",
" print(f\"Benchmarking {task.name} with model: {model}\")\n",
" eval_config = task.get_eval_config()\n",
" agent_factory = agents.OpenAIAgentFactory(task, model=model)\n",
"\n",
" client.run_on_dataset(\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" verbose=False,\n",
" project_name=f\"{dataset_name}-{model}-{experiment_uuid}\",\n",
" tags=[model],\n",
" concurrency_level=1,\n",
" project_metadata={\n",
" \"model\": model,\n",
" \"id\": experiment_uuid,\n",
" \"task\": task.name,\n",
" \"date\": today,\n",
" },\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
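As the intro cell notes, the models list can be expanded to benchmark additional models; each (task, model) pair then gets its own LangSmith project. A minimal sketch of that edit, where the second model name is only a placeholder and not part of this PR:

# In the setup cell above, list every model to benchmark.
# "gpt-4" is an illustrative placeholder, not a model used in this PR.
models = ["gpt-3.5-turbo-16k", "gpt-4"]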
21 changes: 17 additions & 4 deletions langchain_benchmarks/schema.py
@@ -71,7 +71,7 @@ def _table(self) -> List[List[str]]:
         """Return a table representation of the environment."""
         return [
             ["Name", self.name],
-            ["Type", self.__class__.__name__],
+            ["Type", self.type],
             ["Dataset ID", self._dataset_link],
             ["Description", self.description],
         ]
@@ -83,6 +83,11 @@ def _repr_html_(self) -> str:
             tablefmt="unsafehtml",
         )

+    @property
+    def type(self) -> str:
+        """Return the type of the task."""
+        return self.__class__.__name__
+

 @dataclasses.dataclass(frozen=True)
 class ToolUsageTask(BaseTask):
@@ -180,6 +185,14 @@ def __post_init__(self) -> None:
                 )
             seen_names.add(task.name)

+    def __len__(self) -> int:
+        """Return the number of tasks in the registry."""
+        return len(self.tasks)
+
+    def __iter__(self) -> Iterable[BaseTask]:
+        """Iterate over the tasks in the registry."""
+        return iter(self.tasks)
+
     def _repr_html_(self) -> str:
         """Return an HTML representation of the registry."""
         headers = [
@@ -222,10 +235,10 @@ def filter(
         ]
         return Registry(tasks=tasks)

-    def __getitem__(self, key: Union[int, str]) -> BaseTask:
+    def __getitem__(self, key: Union[int, str, slice]) -> Union[BaseTask, Registry]:
         """Get an environment from the registry."""
         if isinstance(key, slice):
-            raise NotImplementedError("Slicing is not supported.")
+            return Registry(tasks=self.tasks[key])
         elif isinstance(key, (int, str)):
             # If key is an integer, return the corresponding environment
             return self.get_task(key)
@@ -438,7 +451,7 @@ def __iter__(self) -> Iterable[RegisteredModel]:
         return iter(self.registered_models)

     def __getitem__(
-        self, key: Union[int, str]
+        self, key: Union[int, str, slice]
     ) -> Union[RegisteredModel, ModelRegistry]:
         """Get an environment from the registry."""
         if isinstance(key, slice):
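For quick reference, here is a small usage sketch of the Registry behaviors this diff introduces (the type property, __len__, __iter__, and slice support in __getitem__). It assumes the same registry object imported in the notebook above; any printed values are illustrative, not taken from this PR.

from langchain_benchmarks import registry

print(len(registry))             # __len__: number of registered tasks

for task in registry:            # __iter__: yields BaseTask objects directly
    print(task.name, task.type)  # .type is the task's class name, e.g. "ToolUsageTask"

subset = registry[:2]            # slicing now returns a new Registry instead of raising
print(len(subset))

first = registry[0]              # int (or str name) keys still return a single task

The last hunk applies the same Union[int, str, slice] key type to ModelRegistry.__getitem__, which already branched on slices; only its annotation changes.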