Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/release/TRAJECTORIES.md
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,8 @@ async def add_model_response(
tokens=tokens,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id=state["trajectory_id"],
extras={},
)
state["trajectory"].append(trajectory_step)
Expand Down
2 changes: 2 additions & 0 deletions tests/test_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ async def rollout(
tokens=tokens,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id=state["trajectory_id"],
extras={},
)
state["trajectory"].append(trajectory_step)
Expand Down
2 changes: 2 additions & 0 deletions tests/test_environment_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ async def rollout(
tokens=tokens,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id=state["trajectory_id"],
extras={},
)
state["trajectory"].append(trajectory_step)
Expand Down
10 changes: 10 additions & 0 deletions tests/test_rlm_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,8 @@ async def test_prepends_trajectory_steps_during_cleanup(self, rlm_env):
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="sub_batch1_req1",
extras={"is_sub_llm_call": True, "timestamp": 1.0},
)
sub_step2 = TrajectoryStep(
Expand All @@ -1198,6 +1200,8 @@ async def test_prepends_trajectory_steps_during_cleanup(self, rlm_env):
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="sub_batch1_req2",
extras={"is_sub_llm_call": True, "timestamp": 2.0},
)
rlm_env.active_rollouts[rollout_id] = {
Expand All @@ -1213,6 +1217,8 @@ async def test_prepends_trajectory_steps_during_cleanup(self, rlm_env):
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="main_trajectory",
extras={},
)
state = {"rollout_id": rollout_id, "trajectory": [main_step]}
Expand Down Expand Up @@ -1251,6 +1257,8 @@ async def test_no_prepend_when_disabled(self, mock_sandbox_client, mock_dataset)
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="sub_batch1_req1",
extras={"is_sub_llm_call": True, "timestamp": 1.0},
)
env.active_rollouts[rollout_id] = {
Expand All @@ -1265,6 +1273,8 @@ async def test_no_prepend_when_disabled(self, mock_sandbox_client, mock_dataset)
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="main_trajectory",
extras={},
)
state = {"rollout_id": rollout_id, "trajectory": [main_step]}
Expand Down
8 changes: 8 additions & 0 deletions tests/test_singleturn_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ async def test_is_completed_method(self, mock_singleturn_env):
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
)
],
Expand Down Expand Up @@ -487,6 +489,8 @@ async def test_singleturn_stops_after_one_response(
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
)
]
Expand Down Expand Up @@ -514,6 +518,8 @@ async def test_singleturn_stops_after_one_response(
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
),
TrajectoryStep(
Expand All @@ -523,6 +529,8 @@ async def test_singleturn_stops_after_one_response(
tokens=None,
reward=None,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
),
]
Expand Down
14 changes: 14 additions & 0 deletions tests/test_trajectory_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,13 @@ def test_process_trajectory_steps_for_training():
completion_ids=[3, 4],
completion_mask=[1, 1],
completion_logprobs=[-0.1, -0.2],
overlong_prompt=False,
is_truncated=False,
),
reward=1.0,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
)
]
Expand All @@ -135,9 +139,13 @@ def test_process_trajectory_steps_for_training():
completion_ids=[6, 7, 8],
completion_mask=[1, 1, 1],
completion_logprobs=[-0.3, -0.4, -0.5],
overlong_prompt=False,
is_truncated=False,
),
reward=0.5,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
)
]
Expand Down Expand Up @@ -192,6 +200,8 @@ def test_process_trajectory_steps_skip_missing_tokens():
tokens=None,
reward=1.0,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
),
TrajectoryStep(
Expand All @@ -204,9 +214,13 @@ def test_process_trajectory_steps_skip_missing_tokens():
completion_ids=[2, 3],
completion_mask=[1, 1],
completion_logprobs=[-0.1, -0.2],
overlong_prompt=False,
is_truncated=False,
),
reward=0.5,
advantage=None,
is_truncated=False,
trajectory_id="test_trajectory",
extras={},
),
]
Expand Down
2 changes: 2 additions & 0 deletions verifiers/envs/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
import signal
import time
import uuid
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
Expand Down Expand Up @@ -597,6 +598,7 @@ async def init_state(
else:
state["oai_tools"] = []
state["trajectory"] = []
state["trajectory_id"] = uuid.uuid4().hex
state["reward"] = None
state["metrics"] = None
state["error"] = None
Expand Down
1 change: 1 addition & 0 deletions verifiers/envs/experimental/rlm_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,7 @@ async def _handle_sub_llm_request(self, request: Any) -> Any:
reward=None,
advantage=None,
is_truncated=is_truncated,
trajectory_id=f"{batch_id}_{request_id}",
extras={
"is_sub_llm_call": True,
"parent_turn": parent_turn,
Expand Down
1 change: 1 addition & 0 deletions verifiers/envs/multiturn_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ async def add_model_response(
reward=None,
advantage=None,
is_truncated=is_truncated,
trajectory_id=state["trajectory_id"],
extras={},
)
trajectory_step["completion"] = completion_messages
Expand Down
1 change: 1 addition & 0 deletions verifiers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class TrajectoryStep(TypedDict):
reward: float | None
advantage: float | None
is_truncated: bool
trajectory_id: str
extras: dict[str, Any]


Expand Down
Loading