tests/test_rlm_env.py: 104 additions & 0 deletions

@@ -9,6 +9,7 @@
import pickle
import shutil
from pathlib import Path
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
@@ -394,6 +395,109 @@ async def test_bash_prompt_mentions_env_vars(self, rlm_env_bash):
await env.cleanup_rlm_state(result)


class TestPromptVerbosity:
@pytest.mark.asyncio
@pytest.mark.parametrize(
"verbosity, expected_snippets, unexpected_snippets",
[
(
"light",
[
"You have the `call_python_repl` tool and a filesystem available to you."
],
[
"This is an iterative environment.",
"Critical: This is an ITERATIVE environment",
],
),
(
"medium",
[
"You have the `call_python_repl` tool and a filesystem available to you.",
"This is an iterative environment.",
],
["Critical: This is an ITERATIVE environment"],
),
(
"heavy",
[
"iterative Python REPL where you explore data step by step.",
"Critical: This is an ITERATIVE environment",
],
["This is an iterative environment."],
),
],
)
async def test_root_prompt_verbosity_python(
self,
verbosity: str,
expected_snippets: list[str],
unexpected_snippets: list[str],
):
dataset = make_dataset({})
env = build_env(
dataset, repl_language="python", root_prompt_verbosity=verbosity
)
env._ensure_interception_server = AsyncMock()
env._executor.setup = AsyncMock()

state = {"info": {}, "model": "m", "client": MagicMock()}
result = await env.setup_state(state)
try:
prompt = result["rlm_system_prompt"]
for snippet in expected_snippets:
assert snippet in prompt
for snippet in unexpected_snippets:
assert snippet not in prompt
finally:
await env.cleanup_rlm_state(result)

@pytest.mark.asyncio
@pytest.mark.parametrize("verbosity", ["light", "medium", "heavy"])
async def test_sub_prompt_verbosity(self, verbosity: str, rlm_env: RLMEnv):
env = rlm_env
env.sub_prompt_verbosity = verbosity
env.sub_tool_max_turns = 7

captured: dict[str, Any] = {}

async def _fake_run_sub_llm(state, client, model, messages):
captured["messages"] = messages
return {
"final_content": "ok",
"turns": [
{
"prompt_messages": [{"role": "user", "content": "hi"}],
"response": {},
"tool_call_count": 0,
}
],
"total_prompt_tokens": 0,
"total_completion_tokens": 0,
"tool_call_count": 0,
"num_turns": 1,
"max_turns_reached": False,
}

env._run_sub_llm = AsyncMock(side_effect=_fake_run_sub_llm)

await env._run_sub_llm_request(
state_ref={},
client=MagicMock(),
sub_model="m",
messages=[{"role": "user", "content": "task"}],
batch_id="b",
request_id="r",
parent_turn=0,
)

expected = rlm_module._SUB_LLM_SYSTEM_PROMPT_STORE[verbosity].format(
num_turns=env.sub_tool_max_turns
)
assert captured["messages"][0]["role"] == "system"
assert captured["messages"][0]["content"] == expected


class TestBashReplOutput:
@pytest.mark.asyncio
async def test_bash_output_is_raw(self, rlm_env_bash):
verifiers/envs/experimental/rlm_env.py: 98 additions & 14 deletions

@@ -42,7 +42,7 @@
from dataclasses import dataclass
from pathlib import Path
from time import perf_counter
from typing import Any, Callable, Literal, cast

if sys.version_info < (3, 12):
from typing_extensions import TypedDict
@@ -1374,14 +1374,49 @@ def _render_worker_script(


# System prompt for sub-LLMs (called via llm_batch)
_SUB_LLM_SYSTEM_PROMPT_STORE = {
"light": ("You have {num_turns} turns available to fulfill your task."),
"medium": (
"You will be given a task to perform."
" Consider the tools at your disposal closely,"
" and don't be afraid to think as much as you need about every step."
"\n\nYou have {num_turns} turns available to fulfill your task."
" You will be warned when there's only one turn left."
),
"heavy": (
"You will be given a task to perform."
" Consider the tools at your disposal closely,"
" and don't be afraid to think as much as you need about every step."
"\n\nYou have {num_turns} turns available to fulfill your task."
" Unless the task is trivial, use the turns to their fullest to make sure you get the answer right."
" Plan well for how to fulfill the task within the turn limit, but don't be afraid to experiment;"
" there's a tradeoff to be had and you should think very carefully about how to optimize it."
" You will be warned when there's only one turn left."
),
}
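
Note (illustrative, not part of the diff): each store entry is a plain str.format template keyed by verbosity, rendered with the same .format(num_turns=...) call that _run_sub_llm_request makes further down. A minimal sketch:

# Each entry carries a single {num_turns} placeholder.
prompt = _SUB_LLM_SYSTEM_PROMPT_STORE["medium"].format(num_turns=5)
assert "You have 5 turns available to fulfill your task." in prompt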


# System prompt for RLM
_RLM_PYTHON_SYSTEM_PROMPT_STORE = {
"light": """You have the `call_python_repl` tool and a filesystem available to you.

filesystem info:

{filesystem_summary}

There exists an `answer` variable, which is a dict. `answer["content"]` must contain your answer. When the final answer is set, set `answer["ready"] = True`.
""",
"medium": """You have the `call_python_repl` tool and a filesystem available to you.

filesystem info:

{filesystem_summary}

There exists an `answer` variable, which is a dict. `answer["content"]` must contain your answer. When the final answer is set, set `answer["ready"] = True`.

This is an iterative environment. Make use of sub-LLMs via `llm_batch` whenever they could be useful; prefer calling them in parallel to calling them sequentially.
""",
"heavy": """You are operating in a Recursive Language Model (RLM) environment - an iterative Python REPL where you explore data step by step.

## Critical: This is an ITERATIVE environment

@@ -1419,10 +1454,30 @@ def _render_worker_script(
2. **One step at a time** - make small tool calls, see output, then continue
3. **Use `llm_batch()` for semantic tasks** - summarization, understanding text, classification, etc.
Pass a list of strings only (no message dicts).
"""
""",
}
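
Note (illustrative, not part of the diff): a minimal sketch of the answer contract these Python prompts describe. In the real REPL the answer dict is pre-injected by RLMEnv; it is created by hand here so the sketch stands alone.

# `answer` is injected by the environment; defined here only so the
# sketch is self-contained.
answer: dict = {"content": None, "ready": False}

result = 2 + 2                   # stand-in for real exploration work
answer["content"] = str(result)  # the final answer goes in answer["content"]
answer["ready"] = True           # signals the environment that the answer is final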


_RLM_BASH_SYSTEM_PROMPT_STORE = {
"light": """You have the `call_bash_repl` tool and a filesystem available to you.

filesystem info:

{filesystem_summary}

In the end, the `RLM_CONTENT` environment variable must contain your answer. When the final answer is set, call `export RLM_READY=1`.
""",
"medium": """You have the `call_bash_repl` tool and a filesystem available to you.

filesystem info:

{filesystem_summary}

In the end, the `RLM_CONTENT` environment variable must contain your answer. When the final answer is set, call `export RLM_READY=1`.

This is an iterative environment. Make use of sub-LLMs via `llm_batch` whenever they could be useful; prefer calling them in parallel to calling them sequentially.
""",
"heavy": """You are operating in a Recursive Language Model (RLM) environment - an iterative Bash REPL where you explore data step by step.

## Critical: This is an ITERATIVE environment

@@ -1464,7 +1519,8 @@ def _render_worker_script(
- For structured args/kwargs, use `--json` with a payload like `{"args":[...],"kwargs":{...}}`
(or provide the JSON via stdin).
- `llm_batch` accepts `--json` with `{"prompts":[...]}`
"""
""",
}


class BaseRLMExecutor:
@@ -2370,6 +2426,8 @@ class RLMEnv(vf.StatefulToolEnv):
share a name within a list, initialization raises an error.
sub_tool_max_turns: Maximum tool-calling turns for sub-LLM calls (default: 5)
sub_model: Model to use for sub-LLM calls (defaults to same as root model)
sub_prompt_verbosity: The verbosity of the sub-LLMs' system prompt; "light", "medium", or "heavy"
root_prompt_verbosity: The verbosity of the root LLM's system prompt; "light", "medium", or "heavy"
max_iterations: Maximum REPL iterations before stopping (maps to max_turns)
max_output_length: Maximum length of code execution output
max_sub_llm_parallelism: Maximum number of concurrent sub-LLM calls
@@ -2427,6 +2485,8 @@ def __init__(
sub_tools: list[Callable] | None = None,
sub_tool_max_turns: int = 5,
sub_model: str | None = None,
sub_prompt_verbosity: Literal["light", "medium", "heavy"] = "light",
root_prompt_verbosity: Literal["light", "medium", "heavy"] = "light",
max_iterations: int = 50,
max_output_length: int = 8192,
max_sub_llm_parallelism: int = 5,
@@ -2435,8 +2495,8 @@
context_key: str = "context",
context_dir_key: str = "context_dir",
system_prompt: str | None = None,
repl_language: Literal["bash", "python"] = "bash",
execution_backend: Literal["local", "sandbox"] = "local",
interception_host: str | None = None,
interception_port: int = 8766,
interception_url: str | None = None,
@@ -2473,6 +2533,18 @@ def __init__(
raise ValueError(
f"Unsupported execution_backend '{execution_backend}'. Expected 'local' or 'sandbox'."
)
if sub_prompt_verbosity not in {"light", "medium", "heavy"}:
raise ValueError(
f"Unsupported sub_prompt_verbosity '{sub_prompt_verbosity}'. "
"Expected 'light', 'medium', or 'heavy'."
)
if root_prompt_verbosity not in {"light", "medium", "heavy"}:
raise ValueError(
f"Unsupported root_prompt_verbosity '{root_prompt_verbosity}'. "
"Expected 'light', 'medium', or 'heavy'."
)
self.sub_prompt_verbosity = sub_prompt_verbosity
self.root_prompt_verbosity = root_prompt_verbosity
self.repl_language = repl_language
self.execution_backend = execution_backend
self.sub_model = sub_model
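
Note (illustrative, not part of the diff): a hedged usage sketch of the new knobs. The kwargs are taken from the signature above; the remaining required RLMEnv arguments are elided and would need to be supplied.

# Wiring the verbosity options added in this PR; other required
# constructor arguments are omitted.
env = RLMEnv(
    repl_language="python",
    root_prompt_verbosity="heavy",  # full RLM framing for the root model
    sub_prompt_verbosity="medium",  # turn-budget guidance for sub-LLMs
    sub_tool_max_turns=7,           # substituted for {num_turns} in the sub prompt
)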
@@ -3241,7 +3313,15 @@ async def _run_sub_llm_request(
elapsed_seconds: float | None = None,
) -> dict[str, Any]:
messages_with_system: ChatMessages = [
cast(
ChatMessage,
{
"role": "system",
"content": _SUB_LLM_SYSTEM_PROMPT_STORE[
self.sub_prompt_verbosity
].format(num_turns=self.sub_tool_max_turns),
},
),
*messages,
]

@@ -3626,9 +3706,13 @@ async def setup_state(self, state: State, **kwargs) -> State:
if self.custom_system_prompt:
base_system_prompt = self.custom_system_prompt
elif self.repl_language == "bash":
base_system_prompt = _RLM_BASH_SYSTEM_PROMPT_STORE[
self.root_prompt_verbosity
]
else:
base_system_prompt = _RLM_PYTHON_SYSTEM_PROMPT_STORE[
self.root_prompt_verbosity
]
if "{filesystem_summary}" in base_system_prompt:
# Use replace instead of format to avoid conflict with curly braces from Python code
base_system_prompt = base_system_prompt.replace(
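
Note (illustrative, not part of the diff): on the replace-instead-of-format comment in the last hunk - the prompt templates embed literal braces (JSON payloads, answer["ready"], shell snippets), so str.format would try to interpret them, while str.replace touches only the one placeholder. A sketch with a hypothetical template:

# format() parses every brace pair; the literal {"args": []} below is
# read as a field named '"args"' and raises KeyError.
template = 'Payload looks like {"args": []}. Files: {filesystem_summary}'
try:
    template.format(filesystem_summary="3 files")
except (KeyError, ValueError):
    pass  # format() fails on the literal JSON braces
safe = template.replace("{filesystem_summary}", "3 files")  # works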