Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@ dependencies = [
"msgpack>=1.1.2",
]

[tool.uv.sources]
prime-tunnel = { git = "https://github.com/PrimeIntellect-ai/prime.git", branch = "feature/tunnel", subdirectory = "packages/prime-tunnel" }
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed git source for prime-tunnel dependency

High Severity

The [tool.uv.sources] section that specified the git source for prime-tunnel was removed, but the package is still listed as a dependency and is actively imported in cli_agent_env.py and rlm_env.py. This change is unrelated to the PR's purpose of improving vf-eval display and will likely cause installation failures if prime-tunnel is not available on PyPI.

Fix in Cursor Fix in Web

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was intentional: the feature/tunnel branch was merged into main and then deleted, so verifiers no longer builds while this git source is configured. The prime-tunnel package has also been published since then, so we can simply remove the [tool.uv.sources] entry and installation still works.


[dependency-groups]
dev = [
"ruff",
Expand Down
5 changes: 4 additions & 1 deletion verifiers/envs/integrations/browser_env/modes/cua_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,11 @@ def verify_server_connection(self) -> None:
if loop is not None:
import concurrent.futures

def _run_health_check() -> None:
asyncio.run(self._check_server_health())

with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(asyncio.run, self._check_server_health())
future = executor.submit(_run_health_check)
future.result()
else:
asyncio.run(self._check_server_health())
Expand Down
135 changes: 85 additions & 50 deletions verifiers/utils/eval_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def elapsed_time(self) -> float:
return end - self.start_time


def _make_histogram(values: list[float], bins: int = 10, width: int = 20) -> Text:
"""Create a simple text histogram of values."""
def _make_histogram(values: list[float], bins: int = 10, height: int = 8) -> Text:
"""Create a simple vertical text histogram of values."""
if not values:
return Text("no data", style="dim")

Expand All @@ -74,16 +74,51 @@ def _make_histogram(values: list[float], bins: int = 10, width: int = 20) -> Tex
counts[bin_idx] += 1

max_count = max(counts)
out = Text()
scaled = [
int(round((c / max_count) * height)) if max_count > 0 else 0 for c in counts
]

label_width = max(
4,
len(f"{min_val:.2f}"),
len(f"{max_val:.2f}"), # keep labels aligned
)
count_width = max(len(str(c)) for c in counts)
col_width = max(label_width, count_width)
spacer = " "
bar_on = "█" * col_width
bar_off = "░" * col_width

out = Text()
# Counts (top row)
for i, count in enumerate(counts):
bin_start = min_val + i * bin_width
bar_len = int((count / max_count) * width) if max_count > 0 else 0
bar = "█" * bar_len + "░" * (width - bar_len)
out.append(str(count).center(col_width), style="dim")
if i < bins - 1:
out.append(spacer)
out.append("\n")

# Bars (top to bottom)
for row in range(height, 0, -1):
for i, h in enumerate(scaled):
if h >= row:
out.append(bar_on, style="cyan")
else:
out.append(bar_off, style="dim")
if i < bins - 1:
out.append(spacer)
out.append("\n")

# Baseline
out.append("─" * (bins * col_width + (bins - 1)), style="dim")
out.append("\n")

out.append(f"{bin_start:5.2f} ", style="dim")
out.append(bar, style="cyan")
out.append(f" {count}\n", style="dim")
# Bin labels (start values)
for i in range(bins):
bin_start = min_val + i * bin_width
label = f"{bin_start:.2f}".center(col_width)
out.append(label, style="dim")
if i < bins - 1:
out.append(spacer)

return out

Expand Down Expand Up @@ -415,7 +450,44 @@ def print_final_summary(self) -> None:
"""Print a comprehensive summary after the display closes."""
self.console.print()

# Summary table with main metrics
# Per-environment detailed sections
for idx, config in enumerate(self.configs):
env_state = self.state.envs[idx]
results = env_state.results

if results is None:
continue

self.console.print()
self.console.print(
Panel(
self._make_env_detail(config, env_state, results),
title=f"[bold blue]{config.env_id}[/bold blue]",
border_style="dim",
)
)

# Print save paths if any
saved_envs = [
(idx, env_state)
for idx, env_state in self.state.envs.items()
if env_state.save_path is not None
]
if saved_envs:
self.console.print()
self.console.print("[bold]Results saved to:[/bold]")
for idx, env_state in saved_envs:
self.console.print(f" [cyan]•[/cyan] {env_state.save_path}")

# Print errors if any
for idx, config in enumerate(self.configs):
env_state = self.state.envs[idx]
if env_state.error:
self.console.print()
self.console.print(f"[red]error in {config.env_id}:[/red]")
self.console.print(f" {env_state.error}")

# Summary table with main metrics (printed last)
table = Table(title="Evaluation Summary")
table.add_column("env_id", style="cyan")
table.add_column("status", justify="center")
Expand Down Expand Up @@ -466,45 +538,8 @@ def print_final_summary(self) -> None:
time_str,
)

self.console.print()
self.console.print(table)

# Per-environment detailed sections
for idx, config in enumerate(self.configs):
env_state = self.state.envs[idx]
results = env_state.results

if results is None:
continue

self.console.print()
self.console.print(
Panel(
self._make_env_detail(config, env_state, results),
title=f"[bold blue]{config.env_id}[/bold blue]",
border_style="dim",
)
)

# Print save paths if any
saved_envs = [
(idx, env_state)
for idx, env_state in self.state.envs.items()
if env_state.save_path is not None
]
if saved_envs:
self.console.print()
self.console.print("[bold]Results saved to:[/bold]")
for idx, env_state in saved_envs:
self.console.print(f" [cyan]•[/cyan] {env_state.save_path}")

# Print errors if any
for idx, config in enumerate(self.configs):
env_state = self.state.envs[idx]
if env_state.error:
self.console.print()
self.console.print(f"[red]error in {config.env_id}:[/red]")
self.console.print(f" {env_state.error}")

self.console.print()

def _make_env_detail(
Expand Down Expand Up @@ -552,7 +587,7 @@ def _make_env_detail(
# All rollouts histogram
all_rollouts_content = Group(
Text("all rollouts:", style="bold"),
_make_histogram(rewards, bins=8, width=25),
_make_histogram(rewards, bins=8, height=8),
)

# Per-example averages if multiple rollouts
Expand All @@ -566,7 +601,7 @@ def _make_env_detail(

per_example_content = Group(
Text("per-example avg:", style="bold"),
_make_histogram(example_avgs, bins=8, width=25),
_make_histogram(example_avgs, bins=8, height=8),
)

# Side by side
Expand Down
33 changes: 22 additions & 11 deletions verifiers/utils/eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import time
from collections import Counter, defaultdict
from collections.abc import Mapping
from contextlib import contextmanager
from contextlib import contextmanager, suppress
from pathlib import Path
from typing import TYPE_CHECKING, cast

Expand Down Expand Up @@ -493,19 +493,30 @@ def on_log(message: str) -> None:
display.update_env_state(env_idx, status="failed", error=str(e))
raise

async def refresh_loop() -> None:
while not display.state.all_completed:
display.refresh()
await asyncio.sleep(1)

try:
async with display:
await asyncio.gather(
*[
run_with_progress(env_config, idx)
for idx, env_config in enumerate(config.evals)
],
return_exceptions=True,
)
refresh_task = asyncio.create_task(refresh_loop())
try:
await asyncio.gather(
*[
run_with_progress(env_config, idx)
for idx, env_config in enumerate(config.evals)
],
return_exceptions=True,
)

display.refresh()
if tui_mode:
await display.wait_for_exit()
display.refresh()
if tui_mode:
await display.wait_for_exit()
finally:
refresh_task.cancel()
with suppress(asyncio.CancelledError):
await refresh_task

except KeyboardInterrupt:
pass # exit on interrupt
Expand Down
Loading