Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ The following named attributes are available for use by reward functions in your Rub
- `prompt`: sequence of input messages
- `completion`: sequence of messages generated during rollout by model and Environment
- `answer`: primary answer column, optional (defaults to empty string if omitted)
- `state`: can be modified during rollout to accumulate any metadata (`state['responses']` includes full OpenAI response objects by default)
- `state`: can be modified during rollout to accumulate any metadata (`state['trajectory']` includes the full list of `TrajectoryStep` objects by default)
- `info`: auxiliary info needed for reward computation (e.g. test cases), optional (defaults to empty dict if omitted)
- `task`: tag for task type (used by `EnvGroup` and `RubricGroup`)
- `parser`: the parser object declared. Note: `vf.Parser().get_format_reward_func()` is a no-op (always 1.0); use `vf.ThinkParser` or a custom parser if you want a real format adherence reward.
Expand Down Expand Up @@ -275,28 +275,27 @@ For training, or self-hosted endpoints, you'll want to enable auto tool choice i

### MultiTurnEnv

Both `SingleTurnEnv` and `ToolEnv` are subclasses of `MultiTurnEnv`, which exposes an interface for writing custom Environment interaction protocols. Override `is_completed` and `env_response`, and make sure any custom completion logic defers to the base class so turn limits and other shared guards keep working.
Both `SingleTurnEnv` and `ToolEnv` are subclasses of `MultiTurnEnv`, which exposes an interface for writing custom Environment interaction protocols. To implement a custom protocol, define an `env_response` method and use `@vf.stop` decorators for termination conditions.

```python
from typing import Tuple
import verifiers as vf
from verifiers.types import Messages, State

class YourMultiTurnEnv(vf.MultiTurnEnv):
def __init__(self,
dataset: Dataset,
rubric: Rubric,
max_turns: int,
**kwargs):

async def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
# Always call the base check so max_turns and shared guards are respected
if await super().is_completed(messages, state, **kwargs):
return True
# return whether or not a rollout is completed
return state.get("task_complete", False)

async def env_response(self, messages: Messages, state: State, **kwargs) -> Tuple[Messages, State]:
# return new environment message(s) + updated state

async def env_response(self, messages: Messages, state: State, **kwargs) -> Messages:
# return new environment message(s); state can be updated in-place
return [{"role": "user", "content": "feedback"}]

@vf.stop
async def task_complete(self, state: State) -> bool:
# return whether or not a rollout is completed
return state.get("task_complete", False)
```

If your application requires more fine-grained control than is allowed by `MultiTurnEnv`, you may want to inherit from the base `Environment` functionality directly and override the `rollout` method.
Expand Down
49 changes: 22 additions & 27 deletions docs/source/components.md
Original file line number Diff line number Diff line change
Expand Up @@ -470,49 +470,47 @@ Build a Wordle-like game with multi-turn interaction:

```python
from verifiers.types import Messages, State
from typing import Tuple

class WordleEnv(vf.MultiTurnEnv):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.max_guesses = 6

def env_response(self, messages: Messages, state: State) -> Tuple[Messages, State]:
if state.get("turn", 0) == 0:
async def env_response(self, messages: Messages, state: State) -> Messages:
if len(state["trajectory"]) == 0:
# First turn: initialize
state["turn"] = 1
state["target"] = state["answer"]
state["guesses"] = []
return [{"role": "user", "content": "Guess a 5-letter word. You have 6 attempts."}], state
return [{"role": "user", "content": "Guess a 5-letter word. You have 6 attempts."}]

# Get the last assistant message
last_msg = messages[-1]
if last_msg["role"] != "assistant":
return [], state # No response if not assistant message
return [] # No response if not assistant message

guess = last_msg["content"].strip().upper()
target = state["target"]

# Validate guess
if len(guess) != 5 or not guess.isalpha():
return [{"role": "user", "content": "Please guess a 5-letter word."}], state
return [{"role": "user", "content": "Please guess a 5-letter word."}]

# Generate feedback
feedback = self.get_feedback(guess, target)
state["guesses"].append(guess)
state["turn"] += 1

if guess == target:
state["solved"] = True
return [{"role": "user", "content": f"Correct! The word was {target}."}], state
elif state["turn"] > self.max_guesses:
return [{"role": "user", "content": f"Correct! The word was {target}."}]
elif len(state["guesses"]) >= self.max_guesses:
state["failed"] = True
return [{"role": "user", "content": f"Out of guesses. The word was {target}."}], state
return [{"role": "user", "content": f"Out of guesses. The word was {target}."}]
else:
remaining = self.max_guesses - state["turn"] + 1
return [{"role": "user", "content": f"{feedback}\n{remaining} guesses remaining."}], state
remaining = self.max_guesses - len(state["guesses"])
return [{"role": "user", "content": f"{feedback}\n{remaining} guesses remaining."}]

def is_completed(self, messages: Messages, state: State) -> bool:
@vf.stop
async def game_over(self, state: State) -> bool:
return state.get("solved", False) or state.get("failed", False)
```

Expand All @@ -521,9 +519,12 @@ class WordleEnv(vf.MultiTurnEnv):
Generate training data using environment rollouts:

```python
import asyncio

async def generate_training_data(env, client, model, num_samples=1000):
"""Generate diverse solutions for training."""
results = []
score_sem = asyncio.Semaphore(1) # Semaphore for scoring

for i in range(num_samples):
# Get a random prompt
Expand All @@ -532,25 +533,19 @@ async def generate_training_data(env, client, model, num_samples=1000):

# Generate multiple solutions
for temp in [0.3, 0.7, 1.0]:
completion, state = await env.rollout(
client=client,
model=model,
prompt=prompt,
answer=answer,
sampling_args={"temperature": temp, "max_tokens": 1000}
)
input = {"prompt": prompt, "answer": answer,
"task": "default", "example_id": i}
state = await env.rollout(input=input, client=client, model=model, sampling_args={"temperature": temp, "max_tokens": 1000})

# Score the solution
rewards = await env.rubric.score_rollout(
prompt, completion, answer, state
)
await env.rubric.score_rollout(state, score_sem)

# Save high-quality solutions
if rewards["total"] > 0.8:
if state.get("reward") and state["reward"] > 0.8:
results.append({
"prompt": prompt,
"completion": completion,
"score": rewards["total"]
"completion": state["completion"],
"score": state["reward"]
})

return Dataset.from_list(results)
Expand Down
50 changes: 31 additions & 19 deletions docs/source/concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,15 @@ from verifiers.types import Messages, State
from typing import Tuple

class MyProtocol(vf.MultiTurnEnv):
async def env_response(self, messages: Messages, state: State) -> Tuple[Messages, State]:
async def env_response(self, messages: Messages, state: State) -> Messages:
"""Define how environment responds to model"""
response = [{"role": "user", "content": "Environment feedback"}]
state["turn"] = state.get("turn", 0) + 1
return response, state
return response

async def is_completed(self, messages: Messages, state: State) -> bool:
@vf.stop
async def task_complete(self, state: State) -> bool:
"""Define when interaction ends"""
# Always defer to the base implementation so turn limits are respected
if await super().is_completed(messages, state):
return True
return state.get("task_complete", False)
```

Expand Down Expand Up @@ -143,23 +141,36 @@ Environments maintain state throughout interactions:

```python
state = {
# automatically managed
"prompt": prompt, # inputs from dataset
"completion": [], # trajectory so far
"answer": answer, # golden answer (str)
"task": task, # optional environment ID column
"info": info, # evaluation metadata (dict) -- can use answer/info/both
"responses": [], # Raw API responses from OpenAI client
"example_id": example_id, # Source dataset row identifier
"turn": 0,
"timing": {"generation_ms": 0.0, "scoring_ms": 0.0, "total_ms": 0.0},
# Input fields (automatically managed)
"prompt": prompt, # Inputs from dataset (list[ChatMessage] or str)
"answer": answer, # Golden answer (str, optional)
"task": task, # Environment ID column (str, optional)
"info": info, # Evaluation metadata (dict, optional) -- can use answer/info/both
"example_id": example_id, # Source dataset row identifier (int)

# Rollout tracking (automatically managed):
"trajectory": [], # Trajectory steps (list[TrajectoryStep], one per LLM request/response)
"completion": None, # Full conversation except the initial prompt (list[ChatMessage] or str),
# rendered from trajectory when rollout ends
"is_completed": False, # Whether rollout has terminated (bool)
"stop_condition": None, # Name of stop condition that terminated rollout (str, optional)
"timing": {"generation_ms": 0.0, "scoring_ms": 0.0, "total_ms": 0.0}, # Timing info (dict)

# custom user-managed state
"lives_remaining": 2,
"inventory": {"potion": 1, "power-up": 2}
...
}
```

**Trajectory Structure**: Each step in `state["trajectory"]` is a `TrajectoryStep` containing:
- `prompt`: Messages sent to LLM for this request
- `completion`: Messages returned from LLM for this request
- `response`: Raw API response object
- `tokens`: Token IDs, masks, and logprobs (if available)
- `reward`: Reward for this step
- `advantage`: Advantage for this step (for RL training)

A wide variety of complex interaction protocols, reward schemes, and training algorithms can be coordinated by tracking the appropriate data in `state`.

## Design Philosophy
Expand Down Expand Up @@ -224,11 +235,12 @@ results = asyncio.run(env.evaluate(client=async_client, model="llama-3.1-8b"))
```
- `rollouts_per_example > 1` repeats dataset entries internally.
- `max_concurrent` throttles concurrent rollouts.
- `save_every` (when > 0) checkpoints intermediate progress during interleaved rollouts (set `interleave_scoring=True`).
- `save_every` (when > 0) checkpoints intermediate progress during rollouts.

- **Scoring**:
- Each reward function returns a float. Weights applied inside `Rubric` combine them into `results.reward`.
- All individual scores are logged under `results.metrics` keyed by function name (even if weight is 0.0).
- Each reward function returns a float. Weights applied inside `Rubric` combine them into `state["reward"]`.
- All individual scores are logged under `state["metrics"]` keyed by function name (even if weight is 0.0).
- Scoring is performed at the group level by default, parallelizing across rollouts.

- **Outputs** (`GenerateOutputs`):
- `prompt`, `completion`, `answer`, `state`, `info`, `task`, `id`, `reward`, `metrics: dict[str, list[float]]`, plus a `metadata` block summarizing the run.
Expand Down
2 changes: 0 additions & 2 deletions docs/source/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,6 @@ def test_with_mock(mock_client):
4. **Group related tests** in test classes
5. **Keep tests fast** - use mocks instead of real API calls

> **Tip:** When subclassing `MultiTurnEnv`, always call `await super().is_completed(...)` (or `await self.max_turns_reached(state)`) so shared guards—especially max turn limits—remain effective.

## Contributing

### Workflow
Expand Down
14 changes: 7 additions & 7 deletions docs/source/environments.md
Original file line number Diff line number Diff line change
Expand Up @@ -336,24 +336,24 @@ from typing import Tuple

class MyGameEnv(vf.MultiTurnEnv):

async def env_response(self, messages: Messages, state: State) -> Tuple[Messages, State]:
async def env_response(self, messages: Messages, state: State) -> Messages:
"""Define how the environment responds."""
last_msg = messages[-1]
if last_msg["role"] != "assistant":
return [], state
return []

player_action = last_msg["content"]
if self.is_game_over(state):
state["done"] = True
return [{"role": "user", "content": "Game over!"}], state
return [{"role": "user", "content": "Game over!"}]

state = self.update_state(state, player_action)
feedback = self.get_game_feedback(state)
return [{"role": "user", "content": feedback}], state
return [{"role": "user", "content": feedback}]

async def is_completed(self, messages: Messages, state: State) -> bool:
if await super().is_completed(messages, state):
return True
@vf.stop
async def game_over(self, state: State) -> bool:
"""Check if game is complete."""
return state.get("solved", False) or state.get("failed", False)
```

Expand Down
Loading