Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ dependencies = [

[dependency-groups]
dev = [
"ruff",
"ruff>=0.15.0",
"pre-commit",
"ty>=0.0.1a29",
"pytest>=7.0.0",
Expand Down
274 changes: 274 additions & 0 deletions tests/test_prime_eval_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
import pytest

import verifiers.cli.commands.eval as eval_command


def test_main_delegates_to_vf_eval_when_not_hosted(monkeypatch):
    """Without --hosted, main() hands its argv through to _run_vf_eval untouched."""
    recorded: dict[str, list[str]] = {}

    def record_argv(argv: list[str]) -> None:
        recorded["argv"] = argv

    monkeypatch.setattr(eval_command, "_run_vf_eval", record_argv)

    eval_command.main(["my-env", "-n", "4"])

    assert recorded["argv"] == ["my-env", "-n", "4"]


def test_main_rejects_hosted_only_flags_without_hosted():
    """--follow is hosted-only, so plain mode must fail with a usage error."""
    with pytest.raises(SystemExit) as excinfo:
        eval_command.main(["my-env", "--follow"])

    # argparse signals a CLI usage error with exit status 2.
    assert excinfo.value.code == 2


def test_main_hosted_creates_expected_payload(monkeypatch):
    """A fully-flagged --hosted run resolves the env id first, then POSTs a
    hosted-evaluation payload reflecting every CLI option."""
    monkeypatch.setattr(
        eval_command,
        "_load_prime_config",
        lambda: {
            "base_url": "https://api.primeintellect.ai",
            "frontend_url": "https://app.primeintellect.ai",
            "api_key": "test-api-key",
        },
    )

    recorded_calls: list[dict[str, object]] = []

    def record_and_respond(
        method: str,
        base_url: str,
        endpoint: str,
        api_key: str,
        *,
        json_payload=None,
        timeout: float = 30.0,
    ):
        recorded_calls.append(
            {
                "method": method,
                "base_url": base_url,
                "endpoint": endpoint,
                "api_key": api_key,
                "payload": json_payload,
                "timeout": timeout,
            }
        )
        # First request: environments-hub lookup; second: evaluation creation.
        if endpoint.startswith("/environmentshub/"):
            return {"data": {"id": "env-123"}}
        if endpoint == "/hosted-evaluations":
            return {
                "evaluation_id": "eval-abc",
                "viewer_url": "https://viewer/eval-abc",
            }
        raise AssertionError(f"unexpected endpoint: {endpoint}")

    monkeypatch.setattr(eval_command, "_request_json", record_and_respond)

    argv = [
        "primeintellect/gsm8k",
        "--hosted",
        "-m",
        "openai/gpt-4.1-mini",
        "-n",
        "10",
        "-r",
        "2",
        "-a",
        '{"difficulty":"hard"}',
        "--timeout-minutes",
        "120",
        "--allow-sandbox-access",
        "--allow-instances-access",
        "--custom-secrets",
        '{"API_KEY":"secret"}',
        "--eval-name",
        "nightly-gsm8k",
    ]
    eval_command.main(argv)

    assert len(recorded_calls) == 2
    assert (
        recorded_calls[0]["endpoint"]
        == "/environmentshub/primeintellect/gsm8k/@latest"
    )

    expected_payload = {
        "environment_ids": ["env-123"],
        "inference_model": "openai/gpt-4.1-mini",
        "eval_config": {
            "num_examples": 10,
            "rollouts_per_example": 2,
            "allow_sandbox_access": True,
            "allow_instances_access": True,
            "env_args": {"difficulty": "hard"},
            "timeout_minutes": 120,
            "custom_secrets": {"API_KEY": "secret"},
        },
        "name": "nightly-gsm8k",
    }
    assert recorded_calls[1]["payload"] == expected_payload


@pytest.mark.parametrize(
    ("display_header", "expected_version"),
    [
        ("primeintellect/wordle", "latest"),
        ("wordle (local - ahead of primeintellect/wordle)", "latest"),
        ("primeintellect/wordle@2.0.0", "2.0.0"),
    ],
)
def test_main_hosted_resolves_slug_from_display_header(
    monkeypatch, display_header: str, expected_version: str
):
    """The X-Prime-Eval-Env-Display header drives the hub slug and version."""
    monkeypatch.setattr(
        eval_command,
        "_load_prime_config",
        lambda: {
            "base_url": "https://api.primeintellect.ai",
            "frontend_url": "https://app.primeintellect.ai",
            "api_key": "test-api-key",
        },
    )

    seen_endpoints: list[str] = []

    def stub_request_json(
        method: str,
        base_url: str,
        endpoint: str,
        api_key: str,
        *,
        json_payload=None,
        timeout: float = 30.0,
    ):
        seen_endpoints.append(endpoint)
        if endpoint.startswith("/environmentshub/"):
            return {"data": {"id": "env-456"}}
        if endpoint == "/hosted-evaluations":
            return {"evaluation_id": "eval-456"}
        raise AssertionError(f"unexpected endpoint: {endpoint}")

    monkeypatch.setattr(eval_command, "_request_json", stub_request_json)

    eval_command.main(
        [
            "wordle",
            "--hosted",
            "--header",
            f"X-Prime-Eval-Env-Display: {display_header}",
        ]
    )

    # The very first request must be the hub lookup for the resolved slug.
    expected_endpoint = f"/environmentshub/primeintellect/wordle/@{expected_version}"
    assert seen_endpoints[0] == expected_endpoint


def test_main_hosted_supports_toml_config(monkeypatch, tmp_path):
    """Hosted mode driven by a TOML config: one hosted-evaluation POST per
    [[eval]] entry, with per-entry settings taking precedence over the
    environment's eval defaults, and CLI-wide --custom-secrets applied to
    every created evaluation.

    Note: the assertions on calls[0]/calls[2] rely on the exact request order
    (hub lookup, POST, hub lookup, POST).
    """
    config_path = tmp_path / "evals.toml"
    # File content is a placeholder; load_toml_config is stubbed below, so only
    # the path needs to exist for main() to accept it.
    config_path.write_text("[[eval]]\nenv_id='placeholder'\n", encoding="utf-8")

    monkeypatch.setattr(
        eval_command,
        "_load_prime_config",
        lambda: {
            "base_url": "https://api.primeintellect.ai",
            "frontend_url": "https://app.primeintellect.ai",
            "api_key": "test-api-key",
        },
    )
    # Two entries: one fully specified, one that only names an env (it should
    # fall back to the env's own eval defaults and the default model).
    monkeypatch.setattr(
        eval_command,
        "load_toml_config",
        lambda _path: [
            {
                "env_id": "primeintellect/gsm8k",
                "model": "openai/gpt-4.1-mini",
                "num_examples": 10,
                "rollouts_per_example": 2,
                "env_args": {"difficulty": "hard"},
            },
            {
                "env_id": "primeintellect/wordle@2.0.0",
            },
        ],
    )
    # Only the wordle env publishes defaults; they should surface in its payload.
    monkeypatch.setattr(
        eval_command.vf_eval,
        "get_env_eval_defaults",
        lambda env_id: (
            {"num_examples": 11, "rollouts_per_example": 5}
            if env_id == "primeintellect/wordle@2.0.0"
            else {}
        ),
    )

    calls: list[dict[str, object]] = []

    def fake_request_json(
        method: str,
        base_url: str,
        endpoint: str,
        api_key: str,
        *,
        json_payload=None,
        timeout: float = 30.0,
    ):
        calls.append(
            {
                "method": method,
                "base_url": base_url,
                "endpoint": endpoint,
                "api_key": api_key,
                "payload": json_payload,
                "timeout": timeout,
            }
        )
        if endpoint == "/environmentshub/primeintellect/gsm8k/@latest":
            return {"data": {"id": "env-1"}}
        if endpoint == "/environmentshub/primeintellect/wordle/@2.0.0":
            return {"data": {"id": "env-2"}}
        if endpoint == "/hosted-evaluations":
            # Derive a distinct evaluation id from how many POSTs (including
            # this one, already appended above) have been made so far.
            created_count = len([call for call in calls if call["method"] == "POST"])
            return {"evaluation_id": f"eval-{created_count}"}
        raise AssertionError(f"unexpected endpoint: {endpoint}")

    monkeypatch.setattr(eval_command, "_request_json", fake_request_json)

    eval_command.main(
        [
            str(config_path),
            "--hosted",
            "--custom-secrets",
            '{"API_KEY":"secret"}',
        ]
    )

    post_payloads = [call["payload"] for call in calls if call["method"] == "POST"]
    assert len(post_payloads) == 2
    # Request order: gsm8k lookup (0), POST (1), wordle lookup (2), POST (3).
    assert calls[0]["endpoint"] == "/environmentshub/primeintellect/gsm8k/@latest"
    assert calls[2]["endpoint"] == "/environmentshub/primeintellect/wordle/@2.0.0"
    # Entry 1: everything comes from the TOML entry plus the shared secrets.
    assert post_payloads[0] == {
        "environment_ids": ["env-1"],
        "inference_model": "openai/gpt-4.1-mini",
        "eval_config": {
            "num_examples": 10,
            "rollouts_per_example": 2,
            "allow_sandbox_access": False,
            "allow_instances_access": False,
            "env_args": {"difficulty": "hard"},
            "custom_secrets": {"API_KEY": "secret"},
        },
    }
    # Entry 2: env defaults (11/5) and the CLI default model fill the gaps.
    assert post_payloads[1] == {
        "environment_ids": ["env-2"],
        "inference_model": eval_command.vf_eval.DEFAULT_MODEL,
        "eval_config": {
            "num_examples": 11,
            "rollouts_per_example": 5,
            "allow_sandbox_access": False,
            "allow_instances_access": False,
            "custom_secrets": {"API_KEY": "secret"},
        },
    }
87 changes: 87 additions & 0 deletions tests/test_prime_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from pathlib import Path

import verifiers.cli.plugins.prime as prime_plugin


def _make_workspace(tmp_path: Path) -> tuple[Path, Path]:
workspace = tmp_path / "workspace"
env_dir = workspace / "environments" / "my_env"
env_dir.mkdir(parents=True)
(workspace / "verifiers").mkdir()
(workspace / "pyproject.toml").write_text(
'[project]\nname = "workspace"\nversion = "0.1.0"\n',
encoding="utf-8",
)
return workspace, env_dir


def _touch_python(venv_root: Path) -> Path:
    """Create an empty stand-in python binary inside *venv_root* and return its path."""
    interpreter = prime_plugin._venv_python(venv_root)
    interpreter.parent.mkdir(parents=True, exist_ok=True)
    interpreter.write_text("", encoding="utf-8")
    return interpreter


def test_find_workspace_root_from_nested_environment_dir(tmp_path: Path):
    """Workspace root discovery works starting from a nested environment dir."""
    root, nested = _make_workspace(tmp_path)

    assert prime_plugin._find_workspace_root(nested) == root


def test_resolve_workspace_python_prefers_workspace_venv_over_uv_env(
    tmp_path: Path, monkeypatch
):
    """The workspace .venv wins even when UV_PROJECT_ENVIRONMENT points elsewhere."""
    root, nested_env = _make_workspace(tmp_path)
    expected = _touch_python(root / ".venv")
    _touch_python(nested_env / ".venv")  # decoy venv inside the environment dir

    monkeypatch.setattr(prime_plugin, "_python_can_import_module", lambda *_: True)
    monkeypatch.setenv("UV_PROJECT_ENVIRONMENT", str(nested_env / ".venv"))
    monkeypatch.delenv("VIRTUAL_ENV", raising=False)

    assert prime_plugin._resolve_workspace_python(nested_env) == str(expected)


def test_build_module_command_install_adds_workspace_env_path(
    tmp_path: Path, monkeypatch
):
    """Install commands gain --path pointing at the workspace environments dir."""
    root, nested_env = _make_workspace(tmp_path)
    plugin = prime_plugin.PrimeCLIPlugin()

    monkeypatch.setattr(prime_plugin, "_current_cwd", lambda: nested_env)
    monkeypatch.setattr(prime_plugin, "_resolve_workspace_python", lambda *_: "python")

    expected = [
        "python",
        "-m",
        plugin.install_module,
        "my-env",
        "--path",
        str((root / "environments").resolve()),
    ]
    assert plugin.build_module_command(plugin.install_module, ["my-env"]) == expected


def test_build_module_command_eval_rewrites_relative_env_dir_path(
    tmp_path: Path, monkeypatch
):
    """A relative --env-dir-path is rewritten to the absolute workspace dir."""
    root, nested_env = _make_workspace(tmp_path)
    plugin = prime_plugin.PrimeCLIPlugin()

    monkeypatch.setattr(prime_plugin, "_current_cwd", lambda: nested_env)
    monkeypatch.setattr(prime_plugin, "_resolve_workspace_python", lambda *_: "python")

    result = plugin.build_module_command(
        plugin.eval_module,
        ["my-env", "--env-dir-path", "./environments"],
    )

    expected = [
        "python",
        "-m",
        plugin.eval_module,
        "my-env",
        "--env-dir-path",
        str((root / "environments").resolve()),
    ]
    assert result == expected
Loading
Loading