Merged

48 commits
ba59032
pick relevant changes from mika/env-worker
mikasenghaas Jan 28, 2026
0223d62
runnable env server/client
mikasenghaas Jan 28, 2026
84433e2
aligned interface
mikasenghaas Jan 28, 2026
ddc6b2f
integrate into vf-eval
mikasenghaas Jan 28, 2026
dd262de
minor
mikasenghaas Jan 28, 2026
02715b3
do not double serialize
mikasenghaas Jan 28, 2026
86f9276
fix retries
mikasenghaas Jan 28, 2026
a38022a
pass state cols
mikasenghaas Jan 28, 2026
377867e
update pyrproject
mikasenghaas Jan 29, 2026
3cbdea8
update logging_utils
mikasenghaas Jan 29, 2026
9330c81
change signatures from state -> output
mikasenghaas Jan 29, 2026
8584482
move extra env kwargs out of load_environment signatures
mikasenghaas Jan 29, 2026
8b3dbc0
do not change signature
mikasenghaas Jan 29, 2026
0c8badd
mini
mikasenghaas Jan 29, 2026
f5a5ef7
name inner funcs
mikasenghaas Jan 29, 2026
d6edb95
deprecate gen/score sem and move global sem into generate()
mikasenghaas Jan 29, 2026
4e11a27
remove unnecesary module inti
mikasenghaas Jan 29, 2026
846bcc0
fix error info in rollout output
mikasenghaas Jan 29, 2026
0eb5ed7
run as daemon process
mikasenghaas Jan 29, 2026
409f580
robustify task cleanup in env
mikasenghaas Jan 29, 2026
67c25d2
graceful shutdowns
mikasenghaas Jan 29, 2026
dc43e9a
informative error
mikasenghaas Jan 29, 2026
bc7422f
revert
mikasenghaas Jan 29, 2026
d913d3e
remove runner
mikasenghaas Jan 29, 2026
ca191bb
fix tests
mikasenghaas Jan 29, 2026
21ff163
handle extra env kwargs
mikasenghaas Jan 29, 2026
85c7080
remove gen/score concurrency limit from eval interface
mikasenghaas Jan 29, 2026
3d76f72
update docs
mikasenghaas Jan 29, 2026
eb9c1af
handle retries and state cols on server as well
mikasenghaas Jan 29, 2026
775d0c0
fix sampling args handling
mikasenghaas Jan 29, 2026
2f0000d
use kill on second attempt
mikasenghaas Jan 29, 2026
a612201
address bugbot
mikasenghaas Jan 29, 2026
0b8f156
fix
mikasenghaas Jan 29, 2026
ed266db
address bugbot
mikasenghaas Jan 29, 2026
afa81d3
asserts
mikasenghaas Jan 29, 2026
42b90c3
add client idx
mikasenghaas Jan 29, 2026
521e130
quiet server-side env loading
mikasenghaas Jan 29, 2026
7681d92
minor
mikasenghaas Jan 29, 2026
27c2f69
dont assert nvm
mikasenghaas Jan 29, 2026
b52228d
fix eval display
mikasenghaas Jan 29, 2026
e9a6611
do not error pydantic on error response
mikasenghaas Jan 29, 2026
ec5387e
skip validation for rollout inputs (needed for multi-modal to work)
mikasenghaas Jan 29, 2026
09a6cb0
fix mutable args
mikasenghaas Jan 29, 2026
a334e9c
wait for server health
mikasenghaas Jan 29, 2026
3e0051b
lock to prevent race condition
mikasenghaas Jan 29, 2026
6efaa7a
fix logging
mikasenghaas Jan 29, 2026
85988ae
Merge branch 'main' into env-server
mikasenghaas Jan 29, 2026
ed2b7d9
docstring tweak
willccbb Jan 30, 2026
2 changes: 0 additions & 2 deletions docs/reference.md
@@ -576,8 +576,6 @@ class EvalConfig(BaseModel):
     num_examples: int
     rollouts_per_example: int
     max_concurrent: int
-    max_concurrent_generation: int | None = None
-    max_concurrent_scoring: int | None = None
     independent_scoring: bool = False
     extra_env_kwargs: dict = {}
     max_retries: int = 0
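
With the per-phase limits gone, a single max_concurrent field bounds the whole eval. A minimal sketch of the post-change config shape, assuming only the fields visible in this hunk (the real EvalConfig defines more than shown):

from pydantic import BaseModel

# Post-change shape, reconstructed from this hunk alone; the real EvalConfig
# in verifiers defines additional fields not shown here.
class EvalConfig(BaseModel):
    num_examples: int
    rollouts_per_example: int
    max_concurrent: int  # single knob, replacing the generation/scoring split
    independent_scoring: bool = False
    extra_env_kwargs: dict = {}
    max_retries: int = 0

# With pydantic's default config, a stray max_concurrent_generation kwarg is
# now silently ignored rather than stored.
cfg = EvalConfig(num_examples=1, rollouts_per_example=1, max_concurrent=8)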
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -47,6 +47,8 @@ dependencies = [
     "typing_extensions; python_version < '3.12'",
     "wget>=3.2",
     "gepa",
+    "pyzmq>=27.1.0",
+    "msgpack>=1.1.2",
 ]

 [tool.uv.sources]
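
The new pyzmq and msgpack dependencies are presumably the transport for the env server and client added in this PR. The sketch below is an assumption, not the PR's actual wiring: the REQ/REP pattern, endpoint, and message shape are all illustrative.

import msgpack
import zmq

# Illustrative endpoint; the PR's actual transport details are not shown here.
ENDPOINT = "tcp://127.0.0.1:5555"

def serve_once() -> None:
    # Hypothetical server body: receive one msgpack request, send one reply.
    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.REP)
    sock.bind(ENDPOINT)
    request = msgpack.unpackb(sock.recv(), raw=False)  # bytes -> Python object
    # Per the commit "do not double serialize", payloads are packed exactly once.
    sock.send(msgpack.packb({"echo": request}))
    sock.close()

def call(payload: dict) -> dict:
    # Hypothetical client: one round trip over a REQ socket.
    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.REQ)
    sock.connect(ENDPOINT)
    sock.send(msgpack.packb(payload))
    reply = msgpack.unpackb(sock.recv(), raw=False)
    sock.close()
    return reply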
9 changes: 3 additions & 6 deletions tests/test_env_group.py
@@ -9,7 +9,6 @@
 from verifiers import EnvGroup, Rubric, SingleTurnEnv
 from verifiers.envs.env_group import EnvGroupRubric
 from verifiers.types import State
-from verifiers.utils.async_utils import NullAsyncContext


 class TestEnvGroupRubric:
@@ -94,9 +93,8 @@ def func2(completion, **kwargs):
         state["oai_tools"] = []
         state["reward"] = None
         state["metrics"] = None
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert "func1" in state["metrics"]
         assert "func2" in state["metrics"]
@@ -131,9 +129,8 @@ async def test_env_group_rubric_unknown_task(self, mock_openai_client, make_input):
         state["oai_tools"] = []
         state["reward"] = None
         state["metrics"] = None
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["reward"] == 0.0

@@ -381,7 +378,7 @@ async def test_env_group_generate(self, mock_openai_client, make_input):
         # Mock the scoring with a properly-typed cast
         from typing import cast

-        async def mock_score_group(states, score_sem=None):
+        async def mock_score_group(states):
             for state in states:
                 state["reward"] = 0.8 if state["task"] == "math" else 0.9
                 state["metrics"] = {}
10 changes: 4 additions & 6 deletions tests/test_environment.py
@@ -510,9 +510,8 @@ async def test_no_retry_after_non_retryable_error(
         rollout_outputs = outputs["outputs"]
         assert env.call_counts[0] == 1  # No retries for non-retryable error
         assert rollout_outputs[0].get("error") is not None
-        assert (
-            "ToolError" in rollout_outputs[0]["error"]
-        )  # Error is serialized as repr string
+        error_info = rollout_outputs[0]["error"]
+        assert "ToolError" == error_info["error"]

     @pytest.mark.asyncio
     async def test_error_in_state_after_max_retries_exhausted(
@@ -532,9 +531,8 @@ async def test_error_in_state_after_max_retries_exhausted(
         rollout_outputs = outputs["outputs"]
         assert env.call_counts[0] == 3  # 1 initial + 2 retries
         assert rollout_outputs[0].get("error") is not None
-        assert (
-            "InfraError" in rollout_outputs[0]["error"]
-        )  # Error is serialized as repr string
+        error_info = rollout_outputs[0]["error"]
+        assert "InfraError" == error_info["error"]


 class TestEmptyModelResponseErrors:
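
These two hunks replace a substring check on a repr string with a lookup into a structured error payload. Only the "error" key, holding the exception class name, is attested by the tests; any other field in the sketch below is a guess:

# Hypothetical error payload: only the "error" key is attested by the tests;
# "message" is an assumed companion field, not confirmed by this diff.
error_info = {"error": "ToolError", "message": "tool call failed"}

# New-style check from the updated tests: exact match on the class name.
assert error_info["error"] == "ToolError"

# Old-style check the PR removed: a substring match against a repr string,
# which would also match unrelated text such as "MyToolErrorWrapper".
assert "ToolError" in repr(error_info)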
2 changes: 0 additions & 2 deletions tests/test_eval_cli.py
@@ -34,8 +34,6 @@ def _run_cli(monkeypatch, overrides, capture_all_configs: bool = False):
         "num_examples": 1,
         "rollouts_per_example": 1,
         "max_concurrent": 1,
-        "max_concurrent_generation": None,
-        "max_concurrent_scoring": None,
         "independent_scoring": False,
         "max_tokens": 42,
         "temperature": 0.9,
10 changes: 3 additions & 7 deletions tests/test_math_rubric.py
@@ -5,7 +5,6 @@
 import pytest

 import verifiers as vf
-from verifiers.utils.async_utils import NullAsyncContext


 class TestMathRubric:
@@ -55,9 +54,8 @@ async def test_score_valid_answers(self, test_case, make_input):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["metrics"]["correct_answer"] == 1.0

@@ -90,9 +88,8 @@ async def test_score_invalid_answers(self, test_case, make_input):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["metrics"]["correct_answer"] == 0.0

@@ -122,10 +119,9 @@ async def test_timeout(self, timeout_seconds, make_input):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

         start_time = time.time()
-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)
         end_time = time.time()
         elapsed_time = end_time - start_time
         assert state["metrics"]["correct_answer"] == 0.0
28 changes: 9 additions & 19 deletions tests/test_rubric.py
@@ -6,7 +6,6 @@

 from verifiers import Parser, Rubric
 from verifiers.types import RewardFunc, RolloutInput, State
-from verifiers.utils.async_utils import NullAsyncContext


 class TestRubric:
@@ -171,9 +170,8 @@ def func2(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert "func1" in state["metrics"]
         assert "func2" in state["metrics"]
@@ -211,9 +209,8 @@ def list_func(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["metrics"]["list_func"] == 2.0  # Length of completion list
         assert state["reward"] == 2.0
@@ -266,8 +263,7 @@ def length_func(completion, **kwargs):
             "start_time": 0.0,
         }

-        score_sem = NullAsyncContext()
-        await rubric.score_group(states, score_sem)
+        await rubric.score_group(states)

         assert states[0]["metrics"]["accuracy_func"] == 1.0
         assert states[1]["metrics"]["accuracy_func"] == 1.0
@@ -304,9 +300,8 @@ def func2(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_group([state], score_sem)
+        await rubric.score_group([state])

         # Weighted sum: 1.0*2.0 + 0.5*3.0 = 3.5
         assert state["reward"] == pytest.approx(3.5)
@@ -319,10 +314,9 @@ def test_func(completion, **kwargs):
             return 1.0

         rubric = Rubric(funcs=[test_func], weights=[1.0])
-        score_sem = NullAsyncContext()

         # score_group with empty list should handle gracefully
-        await rubric.score_group([], score_sem)
+        await rubric.score_group([])

     @pytest.mark.asyncio
     async def test_score_rollouts_with_default_infos(self):
@@ -349,9 +343,8 @@ def simple_func(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_group([state], score_sem)
+        await rubric.score_group([state])

         assert "simple_func" in state["metrics"]
         assert state["metrics"]["simple_func"] == 1.0
@@ -388,9 +381,8 @@ def scalar_func(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_group([state], score_sem)
+        await rubric.score_group([state])

         assert state["metrics"]["scalar_func"] == 0.5
         assert state["reward"] == 0.5
@@ -425,9 +417,8 @@ def f_with_kwargs(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         # Weighted sum: 0.5*1 + 1.0*2 = 2.5
         assert state["reward"] == pytest.approx(2.5)
@@ -464,9 +455,8 @@ def g2(**kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["reward"] == pytest.approx(0.5)
         assert calls == ["g1", "g2"]  # order respected
22 changes: 7 additions & 15 deletions tests/test_rubric_group.py
@@ -4,7 +4,6 @@

 from verifiers import Rubric, RubricGroup, XMLParser
 from verifiers.types import RolloutInput, RolloutTiming, State
-from verifiers.utils.async_utils import NullAsyncContext


 class TestRubricGroup:
@@ -163,8 +162,7 @@ def func2(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should have scores from both rubrics
         assert "func1" in state["metrics"]
@@ -203,8 +201,7 @@ def func1(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should have summed scores for duplicate function names
         assert "func1" in state["metrics"]
@@ -244,8 +241,7 @@ def func1(completion, info=None, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should pass custom kwargs to reward functions
         assert "func1" in state["metrics"]
@@ -282,8 +278,7 @@ def func1(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should work with single rubric
         assert "func1" in state["metrics"]
@@ -302,10 +297,9 @@ def func1(completion, **kwargs):

         # Test with empty data - should handle gracefully
         states = []
-        score_sem = NullAsyncContext()
         # Empty states should not cause errors
         try:
-            await group.score_group(states, score_sem)
+            await group.score_group(states)
         except ZeroDivisionError:
             pytest.skip("score_group doesn't handle empty states yet")

@@ -368,8 +362,7 @@ def func1(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group(states, score_sem)
+        await group.score_group(states)

         # Should work with multiple states
         assert "func1" in states[0]["metrics"]
@@ -420,8 +413,7 @@ def reward_func(completion, parser, answer, **_):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_rollout(state, score_sem)
+        await group.score_rollout(state)

         assert state["reward"] == 1.0
         assert recorded_parsers == [xml_parser]
8 changes: 3 additions & 5 deletions verifiers/envs/env_group.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import time
-from typing import TYPE_CHECKING, AsyncContextManager, Mapping, final
+from typing import TYPE_CHECKING, Mapping, final

 from openai import AsyncOpenAI

@@ -38,7 +38,6 @@ def _get_reward_func_names(self) -> list[str]:
     async def score_rollout(
         self,
         state: vf.State,
-        score_sem: AsyncContextManager,
     ) -> None:
         """
         Evaluate all reward functions in-place for a single rollout.
@@ -57,7 +56,7 @@ async def score_rollout(
             state["metrics"] = metrics
             return

-        await env.rubric.score_rollout(state, score_sem=score_sem)
+        await env.rubric.score_rollout(state)
         env_reward = state.get("reward", 0.0)
         env_metrics = state.get("metrics", {}).copy() if state.get("metrics") else {}

@@ -72,7 +71,6 @@ async def score_rollout(
     async def score_group(
         self,
         states: list[vf.State],
-        score_sem: AsyncContextManager,
     ) -> None:
         """
         Score a group of rollouts, routing to appropriate environment rubrics based on task.
@@ -95,7 +93,7 @@
             return

         # Score all states using the environment's rubric
-        await env.rubric.score_group(states, score_sem=score_sem)
+        await env.rubric.score_group(states)

         # Initialize metrics dict with all reward function names
         aggregated_metrics: dict[str, list[float]] = {
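
Together with the commit "deprecate gen/score sem and move global sem into generate()", the removal of score_sem here suggests concurrency is now bounded once, at the generate() call site, rather than threaded through every rubric. A hedged sketch of that pattern (run_one and the semaphore placement are illustrative, not the library's actual internals):

import asyncio

async def generate(env, states, max_concurrent: int = 8) -> None:
    # One global semaphore bounds generation and scoring together.
    sem = asyncio.Semaphore(max_concurrent)

    async def run_one(state) -> None:
        async with sem:
            # ... rollout generation would happen here ...
            await env.rubric.score_rollout(state)  # new signature: no score_sem

    await asyncio.gather(*(run_one(s) for s in states))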