Skip to content
Merged
2 changes: 1 addition & 1 deletion docs/environments.md
Original file line number Diff line number Diff line change
Expand Up @@ -797,4 +797,4 @@ Newer and more experimental environment classes include:
- **`GymEnv`** — universal runner for Gym-compatible environments (OpenAI Gym / Gymnasium API)
- **`CliAgentEnv`** — runs custom agent code inside sandboxes, intercepting API requests. Accepts sandbox configuration parameters including `docker_image`, `cpu_cores`, `memory_gb`, `disk_size_gb`, `gpu_count`, `timeout_minutes`, `environment_vars`, and `labels` for sandbox categorization
- **`HarborEnv`** — loads Harbor-format agent benchmark tasks
- **`RLMEnv`** — implements Recursive Language Models for unbounded context processing. Execution supports both local and sandbox backends via `execution_backend` (`"local"` default, `"sandbox"` to run the REPL inside a Prime Sandbox). Context is still filesystem-based: a provided `context_dir` is copied into the working directory, or legacy JSON-serializable `context` data is written to `context.json`/`context.txt`. The RLM scaffolding prompt (filesystem availability note, REPL workflow, tool docs) is injected into the first user message wrapped in `<RLM_SCAFFOLDING>...</RLM_SCAFFOLDING>`, preserving any external system prompt; the model-visible prompt is stored in `state["prompt"]`, while the original input prompt is preserved in `state["raw_prompt"]`. The REPL language is configurable via `repl_language` (default: `bash`); use `repl_language="python"` to retain the Python REPL. Bash mode uses `call_bash_repl` and behaves like a terminal; Python mode uses `call_python_repl`. Sub-LLM and root-tool interception for sandboxes is routed through a Prime Tunnel unless `interception_url` is provided. Tooling can be split via `tools` (shared), `root_tools` (REPL-only), and `sub_tools` (sub-LLM tools). Fixed root tools like `llm_batch` are always present and cannot be overridden. Tool ordering is fixed tools → shared tools → role-specific tools, with per-list deduplication by name. Root tools are callable only inside the REPL; sub-LLM tools use standard tool-calling.
- **`RLMEnv`** — implements Recursive Language Models for unbounded context processing. Execution supports both local and sandbox backends via `execution_backend` (`"local"` default, `"sandbox"` to run the REPL inside a Prime Sandbox). Context is still filesystem-based: a provided `context_dir` is copied into the working directory, or legacy JSON-serializable `context` data is written to `context.json`/`context.txt`. The RLM scaffolding prompt (filesystem availability note, REPL workflow, tool docs) is injected into the first user message wrapped in `<RLM_SCAFFOLDING>...</RLM_SCAFFOLDING>`, preserving any external system prompt; the model-visible prompt is stored in `state["prompt"]`, while the original input prompt is preserved in `state["raw_prompt"]`. The REPL language is configurable via `repl_language` (default: `bash`); use `repl_language="python"` to retain the Python REPL. Bash mode uses `call_bash_repl` and behaves like a terminal; Python mode uses `call_python_repl`. Sub-LLM and root-tool interception for sandboxes is routed through a Prime Tunnel unless `interception_url` is provided. Tooling can be split via `tools` (shared), `root_tools` (REPL-only), and `sub_tools` (sub-LLM tools). Fixed root tools like `llm_batch` are always present and cannot be overridden. Tool ordering is fixed tools → shared tools → role-specific tools, with per-list deduplication by name. Root tools are callable only inside the REPL; sub-LLM tools use standard tool-calling. When using the sandbox backend, the sandbox and worker are started eagerly during `setup_state`, and package installs are skipped when the package is already importable in the image.
24 changes: 17 additions & 7 deletions environments/rlm_secrets/rlm_secrets.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import random
import shutil
import string
import tempfile
from pathlib import Path
from typing import Any

Expand Down Expand Up @@ -284,20 +285,29 @@ async def setup_state(self, state: State) -> State:
"""Setup puzzle files in the filesystem."""
# Extract puzzle from info and store directly in state for easy access
info = state.get("info", {})
if not isinstance(info, dict):
info = {}
puzzle = info.get("puzzle", {})
state["puzzle"] = puzzle

# Let RLMEnv do its setup (creates fs_root, starts worker, etc.)
state = await super().setup_state(state)

# Write puzzle files to the filesystem
fs_root = state.get("rlm_fs_root")
if fs_root and puzzle:
temp_dir: str | None = None
if puzzle:
temp_dir = tempfile.mkdtemp(prefix="rlm_secrets_")
for filename, content in zip(
puzzle.get("filenames", []), puzzle.get("contents", [])
):
filepath = Path(fs_root) / filename
filepath = Path(temp_dir) / filename
filepath.write_text(content, encoding="utf-8")
info = dict(info)
info["context_dir"] = temp_dir
state["info"] = info

try:
# Let RLMEnv do its setup (creates fs_root, starts worker, etc.)
state = await super().setup_state(state)
finally:
if temp_dir:
shutil.rmtree(temp_dir, True)

return state

Expand Down
4 changes: 4 additions & 0 deletions verifiers/envs/experimental/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ Environment for running custom agent code inside sandboxes. Intercepts the agent

Environment implementing [Recursive Language Models](https://alexzhang13.github.io/blog/2025/rlm/) (RLMs), an inference strategy where language models can decompose and recursively interact with input context of unbounded length through REPL environments. The root model interacts with a REPL (`repl_language="bash"` by default, or `repl_language="python"` for the Python REPL) and can spawn sub-LLM calls to process chunks of the context recursively. Execution supports both local and sandbox backends via `execution_backend` (`"local"` default, `"sandbox"` to run inside a Prime Sandbox). Extra context is still provided as a filesystem (either a copied `context_dir` or JSON-serializable `context` written to `context.json`/`context.txt`). The RLM scaffolding prompt is injected into the first user message; the model-visible prompt is stored in `state["prompt"]`, while the original input prompt is preserved in `state["raw_prompt"]`. Sandbox interception for sub-LLM/root-tool calls is routed through a Prime Tunnel unless `interception_url` is provided.

Notes:
- When using the sandbox backend, the sandbox and worker are started eagerly during `setup_state`.
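- Package installation in sandboxes is best-effort: a package is installed only if it is not already importable, which avoids unnecessary installs on images that already include it. Importability is checked using the package name with hyphens replaced by underscores, so a package whose import name differs from its distribution name is always passed to `pip install`.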

Tool split:

- `tools`: shared between root and sub-LLMs
Expand Down
184 changes: 116 additions & 68 deletions verifiers/envs/experimental/rlm_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import os
import pickle
import random
import re
import shutil
import signal
import shlex
Expand Down Expand Up @@ -2088,8 +2089,35 @@ async def _install_packages(self, session: SandboxRLMReplSession) -> None:
packages.extend(extras)
if not packages:
return
pkg_list = " ".join(packages)
cmd = f"bash -lc 'pip install -q {pkg_list}'"
# Check each package with a quick import and install only the ones that
# are missing. This avoids failures when pip is unavailable on PATH but
# the package is already present in the image (for example, in
# mini-swe-agent-plus-rlm).
missing: list[str] = []
for pkg in packages:
name = pkg.strip()
name = name.split("@", 1)[0].strip()
name = name.split("[", 1)[0].strip()
# Strip version constraints (e.g., "numpy>1.20,<2.0") at the first specifier.
name = re.split(r"[<>=!~]", name, 1)[0].strip()
module = name.replace("-", "_")
check_cmd = f"bash -lc 'python -c \"import {module}\"'"
try:
result = await self._execute_sandbox_command(
sandbox_id,
check_cmd,
timeout=self.env.max_startup_wait_seconds,
)
except Exception:
missing.append(pkg)
continue
exit_code = getattr(result, "exit_code", 0)
if exit_code not in (0, None):
missing.append(pkg)
if not missing:
return
pkg_list = " ".join(missing)
cmd = f"bash -lc 'python -m pip install -q {pkg_list}'"
result = await self._execute_sandbox_command(
sandbox_id,
cmd,
Expand Down Expand Up @@ -3617,82 +3645,102 @@ async def setup_state(self, state: State, **kwargs) -> State:
"include_sub_llm_in_trajectory=True. Use branched rollouts instead."
)

# 1. Setup interception and register rollout
state = await self._setup_interception_and_register(state, rollout_id)

# 2. Create rollout directories
self._executor.create_rollout_dirs(state)

# 3. Build filesystem context
info = state.get("info") or {}
if not isinstance(info, dict):
info = {}
fs_root = state.get("rlm_fs_root")
if not fs_root:
raise ValueError("RLM filesystem root not initialized")
fs_has_data = False
fs_source: str | None = None

context_dir = info.get(self.context_dir_key)
if context_dir:
fs_source = str(context_dir)
self._copy_context_directory(fs_source, fs_root)
fs_has_data = True
else:
context_data = info.get(self.context_key, None)
if context_data is not None:
try:
# 1. Setup interception and register rollout
state = await self._setup_interception_and_register(state, rollout_id)

# 2. Create rollout directories
self._executor.create_rollout_dirs(state)

# 3. Build filesystem context
info = state.get("info") or {}
if not isinstance(info, dict):
info = {}
fs_root = state.get("rlm_fs_root")
if not fs_root:
raise ValueError("RLM filesystem root not initialized")
fs_has_data = False
fs_source: str | None = None

context_dir = info.get(self.context_dir_key)
if context_dir:
fs_source = str(context_dir)
self._copy_context_directory(fs_source, fs_root)
fs_has_data = True
self._write_builtin_context(context_data, fs_root)

state["rlm_fs_root"] = fs_root
state["rlm_fs_source"] = fs_source
state["rlm_fs_has_data"] = fs_has_data
state["retain_filesystem_after_rollout"] = self.retain_filesystem_after_rollout
if self.custom_system_prompt:
base_system_prompt = self.custom_system_prompt
elif self.repl_language == "bash":
base_system_prompt = _RLM_BASH_SYSTEM_PROMPT_STORE[
self.root_prompt_verbosity
else:
context_data = info.get(self.context_key, None)
if context_data is not None:
fs_has_data = True
self._write_builtin_context(context_data, fs_root)

state["rlm_fs_root"] = fs_root
state["rlm_fs_source"] = fs_source
state["rlm_fs_has_data"] = fs_has_data
state["retain_filesystem_after_rollout"] = (
self.retain_filesystem_after_rollout
)
if self.custom_system_prompt:
base_system_prompt = self.custom_system_prompt
elif self.repl_language == "bash":
base_system_prompt = _RLM_BASH_SYSTEM_PROMPT_STORE[
self.root_prompt_verbosity
]
else:
base_system_prompt = _RLM_PYTHON_SYSTEM_PROMPT_STORE[
self.root_prompt_verbosity
]

packages_docs = self._generate_packages_documentation()
root_tools_docs = self._generate_root_tools_documentation()
sub_tools_docs = self._generate_sub_tools_documentation()
state["rlm_system_prompt"] = (
base_system_prompt + packages_docs + root_tools_docs + sub_tools_docs
)
state["rlm_packages_docs"] = packages_docs
state["rlm_root_tools_docs"] = root_tools_docs
state["rlm_sub_tools_docs"] = sub_tools_docs
deduped_shared, _ = _dedupe_tools(
self.shared_tools, context="shared tools", reserved_names=set()
)
state["rlm_shared_tools"] = [
_tool_display_name(tool) for tool in deduped_shared
]
else:
base_system_prompt = _RLM_PYTHON_SYSTEM_PROMPT_STORE[
self.root_prompt_verbosity
state["rlm_root_tools"] = [
_tool_display_name(tool) for tool in self.root_tools
]
state["rlm_sub_tools"] = [
_tool_display_name(tool) for tool in self.sub_tools
]

packages_docs = self._generate_packages_documentation()
root_tools_docs = self._generate_root_tools_documentation()
sub_tools_docs = self._generate_sub_tools_documentation()
state["rlm_system_prompt"] = (
base_system_prompt + packages_docs + root_tools_docs + sub_tools_docs
)
state["rlm_packages_docs"] = packages_docs
state["rlm_root_tools_docs"] = root_tools_docs
state["rlm_sub_tools_docs"] = sub_tools_docs
deduped_shared, _ = _dedupe_tools(
self.shared_tools, context="shared tools", reserved_names=set()
)
state["rlm_shared_tools"] = [
_tool_display_name(tool) for tool in deduped_shared
]
state["rlm_root_tools"] = [_tool_display_name(tool) for tool in self.root_tools]
state["rlm_sub_tools"] = [_tool_display_name(tool) for tool in self.sub_tools]

# 4. Prepare backend and start worker (defer for sandbox to allow env setup)
if self.execution_backend != "sandbox":
# 4. Prepare backend and start worker (always eager)
await self._executor.prepare_filesystem(state)
await self._executor.setup(state)
state["rlm_worker_ready"] = True
else:
state["rlm_worker_ready"] = False

# Initialize context warning flag (feature enabled if max_seq_len is set)
state["context_warning_sent"] = False
# Initialize context warning flag (feature enabled if max_seq_len is set)
state["context_warning_sent"] = False

# Initialize FIFO sequence counter for detecting stale responses
state["_exec_seq"] = 0
# Initialize FIFO sequence counter for detecting stale responses
state["_exec_seq"] = 0

_ensure_rlm_metric_state(state)
_ensure_rlm_metric_state(state)

return state
return state
except Exception:
# Best-effort cleanup to avoid leaking tunnels/sandboxes on setup failure.
if rollout_id in self.active_rollouts:
del self.active_rollouts[rollout_id]
try:
await self._executor.cleanup(state)
except Exception:
logger.exception("Failed to cleanup RLM executor after setup error")
if not self.active_rollouts:
try:
await self._teardown_interception_server()
finally:
if self.execution_backend == "sandbox":
await self._teardown_tunnel()
raise

# =========================================================================
# Code Execution
Expand Down
Loading