Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
746ea9c
simple multi eval scaffolding via toml config
mikasenghaas Jan 15, 2026
8be1b4a
add debug config
mikasenghaas Jan 15, 2026
138d30e
demote to debug log
mikasenghaas Jan 15, 2026
b43e337
move around logs
mikasenghaas Jan 15, 2026
64e68f5
fix tests
mikasenghaas Jan 15, 2026
8e9f335
support comma-separated list
mikasenghaas Jan 15, 2026
f84ac0e
fix precedence
mikasenghaas Jan 15, 2026
681ebfb
minor
mikasenghaas Jan 15, 2026
f7118f6
fix schema validation
mikasenghaas Jan 15, 2026
8a1da80
minor fix
mikasenghaas Jan 15, 2026
1a1f278
update tests
mikasenghaas Jan 15, 2026
e49c648
add unit tests
mikasenghaas Jan 15, 2026
9716fc4
revert pbar desc
mikasenghaas Jan 15, 2026
26416b8
update docs
mikasenghaas Jan 15, 2026
27b65fa
typo
mikasenghaas Jan 15, 2026
3979923
fix mutation
mikasenghaas Jan 15, 2026
ce63f9b
validation for env ids
mikasenghaas Jan 15, 2026
ad47e3f
fix resolution issue
mikasenghaas Jan 15, 2026
6c047c9
move debug config
mikasenghaas Jan 16, 2026
92b7a77
poc vf-eval tui
mikasenghaas Jan 15, 2026
8a2d8ba
exit on input
mikasenghaas Jan 15, 2026
9790f68
streaming works
mikasenghaas Jan 15, 2026
d6243d1
full width boxes
mikasenghaas Jan 15, 2026
f6f5e27
make env id part of border
mikasenghaas Jan 15, 2026
f1eb16b
remove header
mikasenghaas Jan 15, 2026
a363847
use static env config
mikasenghaas Jan 15, 2026
ab716f6
remove redundant info
mikasenghaas Jan 15, 2026
4be27f2
show running avg of all metrics
mikasenghaas Jan 15, 2026
c9fac9d
spacing
mikasenghaas Jan 15, 2026
c39d265
ckpt
mikasenghaas Jan 15, 2026
4e488c1
fix
mikasenghaas Jan 15, 2026
10f491b
final summary + stack
mikasenghaas Jan 15, 2026
fdf7072
remove global progress
mikasenghaas Jan 15, 2026
02d894c
spacing
mikasenghaas Jan 15, 2026
d6ae255
unify progress callback behavior
mikasenghaas Jan 15, 2026
087aab9
show gen/sem concurrency
mikasenghaas Jan 15, 2026
3a86bc1
show sampling args
mikasenghaas Jan 15, 2026
8e59a13
show saved results path
mikasenghaas Jan 15, 2026
acb7174
formatting
mikasenghaas Jan 15, 2026
bdf5415
remove print_results
mikasenghaas Jan 15, 2026
6b2d641
show -1 concurrency with infinite
mikasenghaas Jan 15, 2026
e6dae48
fix
mikasenghaas Jan 15, 2026
51de374
on log callback
mikasenghaas Jan 15, 2026
e8122ab
show save every
mikasenghaas Jan 15, 2026
d3b8743
fix tests
mikasenghaas Jan 15, 2026
76ee16e
resolve num_examples=-1
mikasenghaas Jan 15, 2026
62a8423
show error
mikasenghaas Jan 15, 2026
7f956c3
cosmetics
mikasenghaas Jan 15, 2026
3e4143d
remove global pbar
mikasenghaas Jan 15, 2026
b503341
refactor progress
mikasenghaas Jan 15, 2026
449346b
refactor accums
mikasenghaas Jan 15, 2026
d78c6d7
fix progress bar
mikasenghaas Jan 15, 2026
0b9cf1d
minor
mikasenghaas Jan 15, 2026
0ccb5e0
minor
mikasenghaas Jan 15, 2026
352b86b
cleanup
mikasenghaas Jan 15, 2026
d910314
fix linter
mikasenghaas Jan 15, 2026
c5144da
cleanup
mikasenghaas Jan 15, 2026
41001dc
resolve num examples diff
mikasenghaas Jan 15, 2026
7c35822
fix
mikasenghaas Jan 15, 2026
3a7fe48
mc
willccbb Jan 21, 2026
b3e7364
tweaks to rendering to avoid scroll issues; configs
willccbb Jan 21, 2026
305da44
remove old config
willccbb Jan 21, 2026
76c969d
merge bug fixes
willccbb Jan 21, 2026
b70a15c
docs; logging tweak
willccbb Jan 21, 2026
edcadd2
revert logging change
willccbb Jan 21, 2026
b50c827
do not exit if no metrics
mikasenghaas Jan 21, 2026
4525353
show avg reward correctly
mikasenghaas Jan 21, 2026
f175aa5
use env_idx to allow eval'ing the same env_id multiple times
mikasenghaas Jan 21, 2026
7c0ee87
guarantee metrics is dict
mikasenghaas Jan 21, 2026
e2da252
simplify
mikasenghaas Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions configs/eval/debug.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
model = "openai/gpt-5-mini"
model = "openai/gpt-4.1-mini"
save_results = true
save_every = 10

[[eval]]
env_id = "primeintellect/wiki-search"

[[eval]]
env_id = "gsm8k"
num_examples = 20
rollouts_per_example = 1
sampling_args = { max_tokens = 1024 }
independent_scoring = true

[[eval]]
env_id = "primeintellect/math-python"
env_id = "alphabet-sort"
6 changes: 6 additions & 0 deletions configs/eval/duplicate-env.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Two [[eval]] entries that share the same env_id ("alphabet-sort").
# NOTE(review): presumably exercises evaluating one environment more than once
# in a single run — confirm against the multi-eval runner.
[[eval]]
env_id = "alphabet-sort"

# Same environment again; this entry additionally sets max_concurrent to 1.
[[eval]]
env_id = "alphabet-sort"
max_concurrent = 1
20 changes: 20 additions & 0 deletions configs/eval/single-turn.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Four [[eval]] entries: math500, aime2024, gpqa and livecodebench.
# NOTE(review): num_examples = -1 presumably means "use every example in the
# dataset" — confirm against the eval config loader.
[[eval]]
env_id = "math500"
num_examples = -1
rollouts_per_example = 1

# aime2024 is the only entry here with more than one rollout per example (8).
[[eval]]
env_id = "aime2024"
num_examples = -1
rollouts_per_example = 8

[[eval]]
env_id = "gpqa"
num_examples = -1
rollouts_per_example = 1

# livecodebench is the only entry here that caps concurrency.
[[eval]]
env_id = "livecodebench"
num_examples = -1
rollouts_per_example = 1
max_concurrent = 16 # to limit sandbox usage
1 change: 1 addition & 0 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ The `--max-retries` flag enables automatic retry with exponential backoff when r
| Flag | Short | Default | Description |
|------|-------|---------|-------------|
| `--verbose` | `-v` | false | Enable debug logging |
| `--tui` | `-u` | false | Show live-updating TUI for multi-env evals |
| `--save-results` | `-s` | false | Save results to disk |
| `--save-every` | `-f` | -1 | Save checkpoint every N rollouts |
| `--state-columns` | `-C` | — | Extra state columns to save (comma-separated) |
Expand Down
4 changes: 2 additions & 2 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ def _run_cli(monkeypatch, overrides, capture_all_configs: bool = False):
"temperature": 0.9,
"sampling_args": None,
"verbose": False,
"print_results": False,
"no_interleave_scoring": False,
"state_columns": [],
"save_results": False,
Expand All @@ -120,6 +119,7 @@ def _run_cli(monkeypatch, overrides, capture_all_configs: bool = False):
"hf_hub_dataset_name": "",
"extra_env_kwargs": {},
"max_retries": 0,
"tui": False,
}
base_args.update(overrides)
args_namespace = SimpleNamespace(**base_args)
Expand All @@ -134,7 +134,7 @@ def _run_cli(monkeypatch, overrides, capture_all_configs: bool = False):
monkeypatch.setattr(vf_eval, "setup_logging", lambda *_, **__: None)
monkeypatch.setattr(vf_eval, "load_endpoints", lambda *_: {})

async def fake_run_evaluation(config):
async def fake_run_evaluation(config, **kwargs):
captured["sampling_args"] = dict(config.sampling_args)
captured["configs"].append(config)
metadata = _make_metadata(config)
Expand Down
29 changes: 26 additions & 3 deletions verifiers/envs/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,15 @@
DatasetBuilder,
GenerateMetadata,
GenerateOutputs,
LogCallback,
Messages,
MessageType,
ModelResponse,
ProgressCallback,
RolloutInput,
RolloutTiming,
SamplingArgs,
StartCallback,
State,
)
from verifiers.utils.async_utils import maybe_retry, maybe_semaphore
Expand Down Expand Up @@ -866,6 +869,9 @@ async def generate(
use_tqdm: bool = True,
independent_scoring: bool = False,
max_retries: int = 0,
on_start: StartCallback | None = None,
on_progress: ProgressCallback | None = None,
on_log: LogCallback | None = None,
) -> GenerateOutputs:
"""
Generate rollouts for a set of inputs.
Expand All @@ -875,6 +881,10 @@ async def generate(
elif isinstance(inputs, list):
inputs_list = inputs

# notify caller of actual total count (useful when num_examples=-1)
if on_start is not None:
on_start(len(inputs_list))

# resolve concurrency knobs
gen_limit = max_concurrent_generation
score_limit = max_concurrent_scoring
Expand Down Expand Up @@ -936,9 +946,9 @@ async def generate(
pbar_total = len(group_list)
pbar_desc = f"Processing {len(group_list)} groups ({len(inputs_list)} total rollouts)"

# set up progress bar
# set up progress bar (only when use_tqdm=True and no external progress callback)
pbar = None
if use_tqdm:
if use_tqdm and on_progress is None:
from tqdm import tqdm

pbar = tqdm(total=pbar_total, desc=pbar_desc, postfix=dict(reward="?"))
Expand All @@ -962,10 +972,13 @@ async def generate(
reward_sum += r
reward_count += 1

# update progress bar or call callback
if pbar is not None:
pbar.update(1)
if reward_count > 0:
pbar.set_postfix(reward=f"{reward_sum / reward_count:.3f}")
elif on_progress is not None:
on_progress(all_states, states)

# save intermediate results
if (
Expand Down Expand Up @@ -1003,9 +1016,11 @@ async def generate(
start_time,
)

# Save if requested
# save if requested
if save_results:
save_rollout_results(results)
if on_log is not None:
on_log(f"Saved final results to {results['metadata']['path_to_save']}")

return results

Expand Down Expand Up @@ -1070,8 +1085,12 @@ async def evaluate(
state_columns: list[str] | None = None,
save_results: bool = False,
save_every: int = -1,
use_tqdm: bool = True,
independent_scoring: bool = False,
max_retries: int = 0,
on_start: StartCallback | None = None,
on_progress: ProgressCallback | None = None,
on_log: LogCallback | None = None,
**kwargs,
) -> GenerateOutputs:
"""
Expand All @@ -1090,8 +1109,12 @@ async def evaluate(
state_columns=state_columns,
save_results=save_results,
save_every=save_every,
use_tqdm=use_tqdm,
independent_scoring=independent_scoring,
max_retries=max_retries,
on_start=on_start,
on_progress=on_progress,
on_log=on_log,
**kwargs,
)

Expand Down
13 changes: 12 additions & 1 deletion verifiers/scripts/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
load_endpoints,
load_toml_config,
run_evaluations,
run_evaluations_tui,
)
from verifiers.utils.install_utils import check_hub_env_installed

Expand Down Expand Up @@ -253,6 +254,13 @@ def main():
default={},
help='Extra environment as JSON object (e.g., \'{"key": "value", "num": 42}\'). Passed to environment constructor.',
)
parser.add_argument(
"--tui",
"-u",
default=False,
action="store_true",
help="Use TUI mode for live evaluation display",
)
parser.add_argument(
"--max-retries",
type=int,
Expand Down Expand Up @@ -423,7 +431,10 @@ def build_eval_config(raw: dict) -> EvalConfig:
logger.debug(f"Evaluation config: {config.model_dump_json(indent=2)}")

eval_run_config = EvalRunConfig(evals=eval_configs)
asyncio.run(run_evaluations(eval_run_config))
if args.tui:
asyncio.run(run_evaluations_tui(eval_run_config))
else:
asyncio.run(run_evaluations(eval_run_config))


if __name__ == "__main__":
Expand Down
6 changes: 6 additions & 0 deletions verifiers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,11 @@ def get(self, key: str, default: Any = None) -> Any:
# oai tools
JsonPrimitive = Literal["string", "number", "integer", "boolean", "array", "object"]

# callbacks
# Optional hooks a caller can supply to observe a generation run. Each one
# returns None; the trailing comment names the argument(s) it receives.

# Receives a single int: the total number of rollouts.
StartCallback = Callable[[int], None] # total rollouts
# Receives two lists of State: every state so far, then the newly added states.
ProgressCallback = Callable[[list[State], list[State]], None] # all_states, new_states
# Receives a single str: a log message.
LogCallback = Callable[[str], None] # log messages


class GenerateMetadata(TypedDict):
"""Pydantic model for generation metadata."""
Expand Down Expand Up @@ -241,6 +246,7 @@ class EvalConfig(BaseModel):
max_retries: int = 0
# logging
verbose: bool = False
use_tqdm: bool = True
# saving
state_columns: list[str] | None = None
save_results: bool = False
Expand Down
Loading
Loading