Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 52 additions & 36 deletions verifiers/envs/experimental/rlm_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,9 +425,12 @@ class RLMEnv(SandboxEnv):
When False, sub-LLM calls happen but are not stored.
context_warning_threshold: Fraction of max_seq_len at which to warn the model
to finish (default: 0.80). Only active if max_seq_len is set.
code_execution_timeout: Timeout in seconds for code execution (default: 600).
code_execution_timeout: Timeout in seconds for code execution (default: 120).
This is longer than the default command timeout to allow for
llm_batch calls which can take several minutes.
abort_on_code_timeout: If True, abort the rollout when code execution times out.
If False (default), return an error message to the model so it can
try a more efficient approach.
**kwargs: Additional arguments passed to SandboxEnv
"""

Expand Down Expand Up @@ -456,6 +459,7 @@ def __init__(
include_sub_llm_in_trajectory: bool = True,
context_warning_threshold: float = 0.80,
code_execution_timeout: int = 120,
abort_on_code_timeout: bool = False,
rubric: Rubric | None = None,
**kwargs,
):
Expand All @@ -474,6 +478,7 @@ def __init__(
self.include_sub_llm_in_trajectory = include_sub_llm_in_trajectory
self.context_warning_threshold = context_warning_threshold
self.code_execution_timeout = code_execution_timeout
self.abort_on_code_timeout = abort_on_code_timeout
# Server-side timeout for LLM API calls (shorter than sandbox HTTP timeout)
# This ensures server responds before sandbox worker's HTTP request times out
self.sub_llm_api_timeout = max(10, int(self.code_execution_timeout * 0.8))
Expand Down Expand Up @@ -1048,52 +1053,42 @@ async def _start_worker(self, state: State) -> None:
interception_url = state["interception_url"]

sub_llm_timeout = max(10, int(self.code_execution_timeout * 0.9))
# Calculate max wait iterations for worker script (written AFTER pip install completes)
# This can take 30-60+ seconds for heavy packages like numpy, sympy, scipy
script_wait_iterations = max(1, int(self.max_startup_wait_seconds / 0.1))
start_worker_cmd = f"""
export RLM_INTERCEPTION_URL="{interception_url}"
export RLM_SUB_MODEL="{self.sub_model or state.get("model", "")}"
export RLM_MAX_SUB_LLM_PARALLELISM="{self.max_sub_llm_parallelism}"
export RLM_SUB_LLM_TIMEOUT="{sub_llm_timeout}"
export RLM_SANDBOX_TIMEOUT="{self.code_execution_timeout}"

# Sync filesystem and verify worker script exists
# Wait for worker script to exist (written AFTER pip install completes in start_command)
# This can take 30-60+ seconds for heavy packages like numpy, sympy, scipy
sync 2>/dev/null || true
for i in $(seq 1 50); do
for i in $(seq 1 {script_wait_iterations}); do
if [ -f "{self._WORKER_PATH}" ]; then
break
fi
sleep 0.1
done

if [ ! -f "{self._WORKER_PATH}" ]; then
echo "Worker script not found at {self._WORKER_PATH}" >&2
echo "Worker script not found - pip install may have failed or timed out" >&2
exit 1
fi

# Small delay to ensure filesystem is fully synced before reading script
sleep 0.5

# Retry starting the worker up to 3 times
# Use touch to pre-create log file, bash -c for reliable backgrounding,
# and check for content (-s) rather than just existence (-f)
for attempt in 1 2 3; do
rm -f /tmp/rlm_worker.log
touch /tmp/rlm_worker.log
bash -c "nohup python -u {self._WORKER_PATH} >> /tmp/rlm_worker.log 2>&1 &"
sleep 0.5
if [ -s /tmp/rlm_worker.log ]; then
break
fi
echo "Worker start attempt $attempt failed (log empty), retrying..." >&2
sleep 0.5
done
sleep 0.2

if [ ! -s /tmp/rlm_worker.log ]; then
echo "Failed to start worker after 3 attempts (log still empty)" >&2
cat /tmp/rlm_worker.log 2>&1 || echo "Could not read log"
exit 1
fi
# Start the worker
nohup python -u {self._WORKER_PATH} >> /tmp/rlm_worker.log 2>&1 &
"""
await self._execute_command_with_retry(sandbox_id, start_worker_cmd)
# Timeout needs to account for pip install wait + worker startup
start_worker_timeout = self.max_startup_wait_seconds + 30
await self._execute_command_with_retry(
sandbox_id, start_worker_cmd, timeout=start_worker_timeout
)
await self._wait_for_worker_ready(sandbox_id)

async def _prepare_sandbox_and_start_worker(
Expand Down Expand Up @@ -1149,7 +1144,7 @@ async def setup_state(self, state: State, **kwargs) -> State:
state = await super().setup_state(state, **kwargs)
sandbox_id = state.get("sandbox_id")
if not sandbox_id:
raise vf.SandboxError(Exception("Sandbox ID not set"))
raise vf.SandboxError() from Exception("Sandbox ID not set")

rollout_id = f"rlm_{uuid.uuid4().hex[:8]}"
state["rollout_id"] = rollout_id
Expand Down Expand Up @@ -1223,7 +1218,11 @@ async def _wait_for_worker_ready(self, sandbox_id: str) -> None:
wait_script = _make_ready_wait_script(
self._READY_FLAG, self.max_startup_wait_seconds
)
result = await self._execute_command_with_retry(sandbox_id, wait_script)
# Use max_startup_wait_seconds as timeout (+ buffer for script overhead)
timeout = self.max_startup_wait_seconds + 10
result = await self._execute_command_with_retry(
sandbox_id, wait_script, timeout=timeout
)
if "failed to start" in result.stdout or "failed to start" in (
result.stderr or ""
):
Expand All @@ -1235,8 +1234,8 @@ async def _wait_for_worker_ready(self, sandbox_id: str) -> None:
logger.error(
f"RLM worker failed to start. Debug info:\n{debug_result.stdout}"
)
raise vf.SandboxError(
Exception(f"RLM worker failed to start: {debug_result.stdout[:500]}")
raise vf.SandboxError() from Exception(
f"RLM worker failed to start: {debug_result.stdout[:500]}"
)

# =========================================================================
Expand Down Expand Up @@ -1272,14 +1271,31 @@ async def _execute_code(
)

try:
result = await self._execute_command_with_retry(
# No retry for code execution - if code times out or fails, return error to model
result = await self.sandbox_client.execute_command(
sandbox_id, command, timeout=self.code_execution_timeout
)
except vf.SandboxError as e:
# Re-raise sandbox errors (including timeouts) to fail the rollout
error_msg = str(e.__cause__) if e.__cause__ else str(e)
logger.error(f"Sandbox error during code execution: {error_msg}")
raise
except CommandTimeoutError as e:
logger.warning(
f"Code execution timed out after {self.code_execution_timeout}s"
)
if self.abort_on_code_timeout:
# Abort rollout immediately on timeout
raise vf.SandboxError() from e
# Return error to model so it can try more efficient code
return {
"status": "error",
"stdout": "",
"stderr": "",
"result": f"Code execution timed out after {self.code_execution_timeout} seconds. "
"Your code may be too slow - consider a more efficient algorithm or "
"breaking the computation into smaller steps.",
"answer": {"ready": False, "content": ""},
}
except Exception as e:
# Other errors - abort the rollout
logger.error(f"Sandbox error during code execution: {e}")
raise vf.SandboxError() from e

if not result.stdout:
return {
Expand Down
Loading