Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion environments/rlm_secrets/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,16 @@ Both reward functions have equal weight (0.5 each):
| Parameter | Default | Description |
|-----------|---------|-------------|
| `num_train_examples` | 100 | Training puzzles |
| `num_eval_examples` | 20 | Evaluation puzzles |
| `num_files` | 4 | Files per puzzle |
| `max_turns` | 50 | Max REPL iterations |
| `sub_tool_max_turns` | 3 | Max tool turns for sub-LLMs |
| `max_sub_llm_parallelism` | 5 | Concurrent sub-LLM calls |
| `code_execution_timeout` | 120 | Bash execution timeout (seconds) |
| `**kwargs` | - | Passed on to `RLMEnv.__init__` |

Note: The eval dataset is not built separately. For evaluation, re-instantiate the
environment with a different `seed` to generate a new synthetic split.

## Why This Environment?

This environment is specifically designed to test RLM capabilities:
Expand Down
11 changes: 3 additions & 8 deletions environments/rlm_secrets/rlm_secrets.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ def build_dataset(
Dataset with prompt, answer, and info columns
"""
rows = []
task_name = "rlm-secrets"

for i in range(num_examples):
puzzle = generate_puzzle(num_files=num_files)
Expand Down Expand Up @@ -359,9 +360,11 @@ def build_dataset(

rows.append(
{
"example_id": i,
"prompt": prompt,
"answer": str(puzzle["correct_position"]),
"info": {"puzzle": puzzle},
"task": task_name,
}
)

Expand Down Expand Up @@ -443,7 +446,6 @@ async def correct_filesystem_state(state: State) -> float:

def load_environment(
num_train_examples: int = 100,
num_eval_examples: int = 20,
num_files: int = 4,
max_turns: int = 50,
seed: int | None = None,
Expand All @@ -458,7 +460,6 @@ def load_environment(

Args:
num_train_examples: Number of training puzzle instances
num_eval_examples: Number of evaluation puzzle instances
num_files: Number of files per puzzle (default: 4)
max_turns: Maximum REPL iterations (default: 50)
seed: Random seed for dataset generation
Expand All @@ -477,19 +478,13 @@ def load_environment(
num_files=num_files,
)

eval_dataset = build_dataset(
num_examples=num_eval_examples,
num_files=num_files,
)

rubric = vf.Rubric(
funcs=[correct_answer, correct_filesystem_state],
weights=[0.5, 0.5],
)

return RLMSecretsEnv(
dataset=train_dataset,
eval_dataset=eval_dataset,
num_files=num_files,
repl_language=repl_language,
rubric=rubric,
Expand Down
154 changes: 122 additions & 32 deletions tests/test_rlm_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -1033,42 +1033,56 @@ async def test_executes_tool_calls(self, rlm_env_with_sub_tools):

class TestSubLLMRequestPaths:
@pytest.mark.asyncio
async def test_interleaved_uses_tokens_endpoint(self, rlm_env):
async def test_sub_llm_ignores_interleaving_and_uses_chat(self, rlm_env):
mock_client = MagicMock()
mock_message = MagicMock()
mock_message.tool_calls = None
mock_message.content = "ok"
mock_response = MagicMock()
mock_client.post = AsyncMock(return_value=mock_response)
mock_client.chat.completions.create = AsyncMock()
mock_response.choices = [MagicMock(message=mock_message)]
mock_client.post = AsyncMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)

rlm_env.interleaved_rollouts = True
messages = [{"role": "user", "content": "hi"}]
state = {"sampling_args": {"max_tokens": 7, "extra_body": {"foo": "bar"}}}

with patch(
"verifiers.envs.experimental.rlm_env.tokenize_vllm",
new=AsyncMock(return_value=[1, 2, 3]),
) as mock_tokenize:
await rlm_env._call_sub_llm_api(state, mock_client, "gpt-4", messages)

mock_tokenize.assert_awaited_once_with(
client=mock_client,
messages=messages,
tools=None,
model="gpt-4",
state = {"sampling_args": {"max_tokens": 7}}

await rlm_env._call_sub_llm_api(state, mock_client, "gpt-4", messages)

mock_client.chat.completions.create.assert_awaited_once()
_, kwargs = mock_client.chat.completions.create.call_args
assert kwargs["max_completion_tokens"] == 7
assert "max_tokens" not in kwargs
mock_client.post.assert_not_called()


# =============================================================================
# 8. llm_batch Prompt Validation
# =============================================================================


class TestLLMBatchPromptValidation:
@pytest.mark.asyncio
async def test_llm_batch_rejects_non_string_prompts(self, rlm_env):
context = {
"client": MagicMock(),
"sub_model": "gpt-4",
"state": {"trajectory": []},
}

contents, _ = await rlm_env._root_llm_batch(
context, [{"role": "user", "content": "hi"}]
)
assert "must be a string" in contents[0]

contents, _ = await rlm_env._root_llm_batch(
context, [[{"role": "user", "content": "hi"}]]
)
mock_client.post.assert_awaited_once()
args, kwargs = mock_client.post.call_args
assert args[0] == "/chat/completions/tokens"
body = kwargs["body"]
assert body["tokens"] == [1, 2, 3]
assert body["max_completion_tokens"] == 7
assert body["return_token_ids"] is True
assert body["foo"] == "bar"
assert "max_tokens" not in body
mock_client.chat.completions.create.assert_not_called()
assert "must be a string" in contents[0]


# =============================================================================
# 8. Root Tool Serialization (pickle)
# 9. Root Tool Serialization (pickle)
# =============================================================================


Expand Down Expand Up @@ -1115,7 +1129,7 @@ def echo_tool(value):


# =============================================================================
# 9. Context Limit Configuration
# 10. Context Limit Configuration
# =============================================================================


Expand All @@ -1130,7 +1144,7 @@ def test_custom_threshold(self):


# =============================================================================
# 10. Sub-LLM Metrics with Tools
# 11. Sub-LLM Metrics with Tools
# =============================================================================


Expand Down Expand Up @@ -1200,18 +1214,94 @@ async def test_accumulates_tokens_across_tool_turns(self, rlm_env_with_sub_tools


# =============================================================================
# 11. Sub-LLM Trajectory Steps
# 12. Sub-LLM Trajectory Steps
# =============================================================================


class TestSubLLMTrajectorySteps:
@pytest.mark.asyncio
async def test_include_sub_llm_in_trajectory_default(self, rlm_env):
assert rlm_env.include_sub_llm_in_trajectory is True
assert rlm_env.include_sub_llm_in_trajectory is False

def test_interleaved_disallowed_when_sub_llm_in_trajectory(self):
    """Expect a ValueError when both trajectory inclusion and interleaving are on.

    The error message must mention ``include_sub_llm_in_trajectory=True``.
    """
    conflicting_options = {
        "include_sub_llm_in_trajectory": True,
        "interleaved_rollouts": True,
    }
    env_dataset = make_dataset({})

    with pytest.raises(ValueError, match="include_sub_llm_in_trajectory=True"):
        build_env(env_dataset, **conflicting_options)

@pytest.mark.asyncio
async def test_sub_llm_steps_added_to_trajectory(self, rlm_env):
    """Check that a sub-LLM request is recorded as a trajectory step.

    With ``include_sub_llm_in_trajectory`` switched on, one call to
    ``_run_sub_llm_request`` should leave exactly one step in
    ``state["trajectory"]``, identified as ``"<batch_id>_<request_id>"``
    and flagged as a sub-LLM call in its extras.
    """
    rlm_env.include_sub_llm_in_trajectory = True
    state = {"trajectory": [], "sampling_args": {}}

    # Fake chat-completion response: a single plain-text choice, no tools.
    assistant_message = MagicMock(tool_calls=None, content="ok")
    fake_response = MagicMock(
        choices=[MagicMock(message=assistant_message)],
        usage=MagicMock(prompt_tokens=1, completion_tokens=1),
    )

    # Canned return value for the patched ``_run_sub_llm``.
    single_turn = {
        "prompt_messages": [{"role": "user", "content": "hi"}],
        "response": fake_response,
        "tool_call_count": 0,
    }
    sub_llm_result = {
        "final_content": "ok",
        "turns": [single_turn],
        "total_prompt_tokens": 1,
        "total_completion_tokens": 1,
        "tool_call_count": 0,
        "num_turns": 1,
        "max_turns_reached": False,
    }

    # Canned return value for the patched ``parse_response_tokens``.
    token_fields = {
        "prompt_ids": [1],
        "prompt_mask": [0],
        "completion_ids": [2],
        "completion_mask": [1],
        "completion_logprobs": [0.0],
        "overlong_prompt": False,
        "is_truncated": False,
    }

    # Build every patcher first so the ``with`` line stays short.
    run_patch = patch.object(
        rlm_env, "_run_sub_llm", new=AsyncMock(return_value=sub_llm_result)
    )
    tokens_patch = patch(
        "verifiers.envs.experimental.rlm_env.parse_response_tokens",
        new=AsyncMock(return_value=token_fields),
    )
    messages_patch = patch(
        "verifiers.envs.experimental.rlm_env.parse_response_messages",
        new=AsyncMock(return_value=[{"role": "assistant", "content": "ok"}]),
    )
    truncated_patch = patch(
        "verifiers.envs.experimental.rlm_env.parse_is_truncated",
        new=AsyncMock(return_value=False),
    )

    with run_patch, tokens_patch, messages_patch, truncated_patch:
        await rlm_env._run_sub_llm_request(
            state_ref=state,
            client=MagicMock(),
            sub_model="gpt-4",
            messages=[{"role": "user", "content": "hi"}],
            batch_id="b1",
            request_id="r1",
            parent_turn=0,
        )

    assert len(state["trajectory"]) == 1
    recorded_step = state["trajectory"][0]
    assert recorded_step["trajectory_id"] == "b1_r1"
    assert recorded_step["extras"]["is_sub_llm_call"] is True


# =============================================================================
# 12. Tunnel Utils (kept for coverage)
# 13. Tunnel Utils (kept for coverage)
# =============================================================================


Expand Down
Loading
Loading