Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ known_actions:
- _target_: hydra.utils.get_class
path: tapeagents.tools.code_executor.PythonCodeAction
- _target_: hydra.utils.get_class
path: tapeagents.tools.browser.ClickAction
path: tapeagents.tools.browser.ClickBIDAction
- _target_: hydra.utils.get_class
path: tapeagents.tools.browser.GoBackAction
- _target_: hydra.utils.get_class
Expand Down Expand Up @@ -82,13 +82,12 @@ nodes:
- _target_: tapeagents.nodes.StandardNode
name: act
system_prompt: ${agent.templates.system_prompt}
guidance: |
Produce single next step. If the answer is ready, produce gaia_answer_action.
${agent.templates.format}
guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer.
steps_prompt: ${agent.templates.allowed_steps}
steps:
- tapeagents.steps.ReasoningThought
- agentlab.benchmarks.gaia.ExtractedFacts
- agentlab.benchmarks.gaia.GaiaAnswer
use_known_actions: true
use_function_calls: true
next_node: act
7 changes: 4 additions & 3 deletions src/agentlab/agents/tapeagent/conf/gaia_l1.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
defaults:
- llm: o4mini
- llm: apriel
- agent: plan_act
- environment: web_code
- _self_

name: gaia_agent
name: Apriel1p5
comment: Gaia L1 val
split: validation
level: "1"
max_turns: 20
parallel_backend: ray
n_jobs: 10
n_jobs: 20
7 changes: 4 additions & 3 deletions src/agentlab/agents/tapeagent/conf/gaia_val.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
defaults:
- llm: gpt4o_mini
- llm: apriel
- agent: plan_act
- environment: web_code
- _self_

name: gaia_agent
name: Apriel1p5
comment: Gaia val
split: validation
level: "all"
max_turns: 20
parallel_backend: ray
n_jobs: 10
n_jobs: 20
9 changes: 9 additions & 0 deletions src/agentlab/agents/tapeagent/conf/llm/apriel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
_target_: tapeagents.llms.TrainableLLM
model_name: Apriel1p5
tokenizer_name: ServiceNow-AI/Apriel-1.5-15b-Thinker
stream: false
use_cache: false
context_size: 128000
base_url: localhost:8000
parameters:
temperature: 0.6
19 changes: 19 additions & 0 deletions src/agentlab/agents/tapeagent/experiments/gaia.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## Setup instructions
- you need podman installed to run code execution and a serper.dev api key to use web search
- to install and configure podman on a mac, use the provided script `src/agentlab/agents/tapeagent/experiments/setup_gaia.sh`
- after the podman machine is up and running, set the DOCKER_HOST env var to its socket: `export DOCKER_HOST=http+unix://$(podman machine inspect --format '{{.ConnectionInfo.PodmanSocket.Path}}')`
- set the env var with the serper dev api key: `export SERPER_API_KEY=your_key`
- set the env var with the url to the inference endpoint: `export LLM_BASE_URL=your_endpoint_url`

## Experiment configs:
- main config: `src/agentlab/agents/tapeagent/conf/gaia_l1.yaml` for L1 subset, `src/agentlab/agents/tapeagent/conf/gaia_val.yaml` for full validation set
- llm configs are in `src/agentlab/agents/tapeagent/conf/llm`. Feel free to add your own
- recommended agent architecture to use is `src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml`. It is already used in the main configs mentioned above.
- env config that describes available tools: `src/agentlab/agents/tapeagent/conf/environment/web_code.yaml`

## Running evaluation:
- to run in debug mode without parallelism: `AGENTLAB_DEBUG=1 python src/agentlab/agents/tapeagent/experiments/run_gaia.py`
- to run quick parallel eval: `python src/agentlab/agents/tapeagent/experiments/run_gaia.py`
- you can adjust content of the entrypoint script `src/agentlab/agents/tapeagent/experiments/run_gaia.py` to change config name.
- when parallel eval is running, the Ray dashboard with progress is available at `http://127.0.0.1:8265/#/jobs/01000000`
- experiment results will be written to a subfolder of `~/agentlab_results/` whose name includes the current datetime, agent name and benchmark name
1 change: 1 addition & 0 deletions src/agentlab/agents/tapeagent/experiments/run_gaia.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

if __name__ == "__main__":
config = load_config("gaia_l1")
config.llm.base_url = os.environ["LLM_BASE_URL"]
study = make_study(
benchmark=GaiaBenchmark.from_config(config), # type: ignore
agent_args=TapeAgentArgs(agent_name=config.name, config=config),
Expand Down
35 changes: 17 additions & 18 deletions src/agentlab/benchmarks/gaia.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class GaiaGym(MultiToolGym):
task: dict
exp_dir: str

def __init__(self, tools: list[Tool | StatefulTool], task: dict, exp_dir: str):
super().__init__(tools=tools)
def __init__(self, tools: list[Tool | StatefulTool], task: dict, exp_dir: str, max_turns: int):
super().__init__(tools=tools, max_turns=max_turns)
self.task = task
self.exp_dir = exp_dir
os.makedirs(".cache", exist_ok=True)
Expand Down Expand Up @@ -67,20 +67,9 @@ class GaiaGymArgs(AbstractEnvArgs):
task_seed: int
task_name: str
env_config: DictConfig
max_turns: int

def __init__(
self,
task_name: str,
task: dict[str, Any],
env_config: DictConfig,
task_seed: int = 0,
):
self.task_name = task_name
self.task = task
self.task_seed = task_seed
self.env_config = env_config

def make_env(self, exp_dir: Path, action_mapping=None) -> GaiaGym:
def make_env(self, exp_dir: Path, action_mapping=None, **kwargs) -> GaiaGym:
tapeagents.config.DB_DEFAULT_FILENAME = str(exp_dir.parent / "tapedata.sqlite")
exp_dir_str = str(exp_dir)
logger.info(f"Init gaia env with directory {exp_dir_str}")
Expand All @@ -89,7 +78,7 @@ def make_env(self, exp_dir: Path, action_mapping=None) -> GaiaGym:
if hasattr(self.env_config.tools[i], "exp_path"):
self.env_config.tools[i].exp_path = exp_dir_str
tools = hydra.utils.instantiate(self.env_config.tools)
env = GaiaGym(tools=tools, task=self.task, exp_dir=exp_dir_str)
env = GaiaGym(tools=tools, task=self.task, exp_dir=exp_dir_str, max_turns=self.max_turns)
return env


Expand Down Expand Up @@ -122,6 +111,7 @@ class GaiaBenchmark(AbstractBenchmark):
model_config = ConfigDict(arbitrary_types_allowed=True)
name: str = "gaia"
split: Literal["test", "validation"]
max_turns: int = 20
level: Literal["1", "2", "3", "all"] = "all"
env_args_list: list[GaiaGymArgs] = None # type: ignore
dataset: dict | None = None # type: ignore
Expand All @@ -134,6 +124,7 @@ def from_config(cls, config: DictConfig, dataset: dict | None = None) -> Self:
level=config.level,
env_config=config.environment,
dataset=dataset,
max_turns=config.max_turns,
)

def model_post_init(self, __context: Any) -> None:
Expand All @@ -151,7 +142,14 @@ def model_post_init(self, __context: Any) -> None:
number += 1
task["number"] = number
name = f"gaia.{task['task_id']}"
env_args = GaiaGymArgs(task_name=name, task=task, env_config=self.env_config)
task_seed = 0
env_args = GaiaGymArgs(
task_name=name,
task=task,
task_seed=task_seed,
env_config=self.env_config,
max_turns=self.max_turns,
)
self.env_args_list.append(env_args)
logger.info(f"Loaded {len(self.env_args_list)} tasks from {self.split} split")

Expand Down Expand Up @@ -192,7 +190,8 @@ def task_to_observations(task: dict, max_doc_length: int = 8000) -> list[Observa
if not question.filename:
return [question]

filename: str | None = question.filename
filename: str = question.filename
assert os.path.exists(filename), f"Attachment {filename} does not exist!"
question.filename = None
steps: list[Observation] = []
name, ext = filename.rsplit(".", maxsplit=1)
Expand Down
Loading