Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ dependencies = [

[dependency-groups]
dev = [
"ruff",
"ruff>=0.15.0",
"pre-commit",
"ty>=0.0.1a29",
"pytest>=7.0.0",
Expand Down
274 changes: 274 additions & 0 deletions tests/test_prime_eval_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
import pytest

import verifiers.cli.commands.eval as eval_command


def test_main_delegates_to_vf_eval_when_not_hosted(monkeypatch):
    """Without --hosted, main() hands its argv through to _run_vf_eval untouched."""
    recorded: dict[str, list[str]] = {}

    def record_argv(argv: list[str]) -> None:
        recorded["argv"] = argv

    monkeypatch.setattr(eval_command, "_run_vf_eval", record_argv)

    eval_command.main(["my-env", "-n", "4"])

    assert recorded["argv"] == ["my-env", "-n", "4"]


def test_main_rejects_hosted_only_flags_without_hosted():
    """--follow is hosted-only, so plain mode must fail with a usage error."""
    with pytest.raises(SystemExit) as excinfo:
        eval_command.main(["my-env", "--follow"])

    # argparse signals a CLI usage error with exit status 2.
    assert excinfo.value.code == 2


def test_main_hosted_creates_expected_payload(monkeypatch):
    """A fully-flagged --hosted run resolves the env id first, then POSTs a
    hosted-evaluation payload reflecting every CLI option."""
    monkeypatch.setattr(
        eval_command,
        "_load_prime_config",
        lambda: {
            "base_url": "https://api.primeintellect.ai",
            "frontend_url": "https://app.primeintellect.ai",
            "api_key": "test-api-key",
        },
    )

    recorded_calls: list[dict[str, object]] = []

    def record_and_respond(
        method: str,
        base_url: str,
        endpoint: str,
        api_key: str,
        *,
        json_payload=None,
        timeout: float = 30.0,
    ):
        recorded_calls.append(
            {
                "method": method,
                "base_url": base_url,
                "endpoint": endpoint,
                "api_key": api_key,
                "payload": json_payload,
                "timeout": timeout,
            }
        )
        # First request: environments-hub lookup; second: evaluation creation.
        if endpoint.startswith("/environmentshub/"):
            return {"data": {"id": "env-123"}}
        if endpoint == "/hosted-evaluations":
            return {
                "evaluation_id": "eval-abc",
                "viewer_url": "https://viewer/eval-abc",
            }
        raise AssertionError(f"unexpected endpoint: {endpoint}")

    monkeypatch.setattr(eval_command, "_request_json", record_and_respond)

    argv = [
        "primeintellect/gsm8k",
        "--hosted",
        "-m",
        "openai/gpt-4.1-mini",
        "-n",
        "10",
        "-r",
        "2",
        "-a",
        '{"difficulty":"hard"}',
        "--timeout-minutes",
        "120",
        "--allow-sandbox-access",
        "--allow-instances-access",
        "--custom-secrets",
        '{"API_KEY":"secret"}',
        "--eval-name",
        "nightly-gsm8k",
    ]
    eval_command.main(argv)

    assert len(recorded_calls) == 2
    assert (
        recorded_calls[0]["endpoint"]
        == "/environmentshub/primeintellect/gsm8k/@latest"
    )

    expected_payload = {
        "environment_ids": ["env-123"],
        "inference_model": "openai/gpt-4.1-mini",
        "eval_config": {
            "num_examples": 10,
            "rollouts_per_example": 2,
            "allow_sandbox_access": True,
            "allow_instances_access": True,
            "env_args": {"difficulty": "hard"},
            "timeout_minutes": 120,
            "custom_secrets": {"API_KEY": "secret"},
        },
        "name": "nightly-gsm8k",
    }
    assert recorded_calls[1]["payload"] == expected_payload


@pytest.mark.parametrize(
    ("display_header", "expected_version"),
    [
        ("primeintellect/wordle", "latest"),
        ("wordle (local - ahead of primeintellect/wordle)", "latest"),
        ("primeintellect/wordle@2.0.0", "2.0.0"),
    ],
)
def test_main_hosted_resolves_slug_from_display_header(
    monkeypatch, display_header: str, expected_version: str
):
    """The X-Prime-Eval-Env-Display header drives the hub slug and version."""
    monkeypatch.setattr(
        eval_command,
        "_load_prime_config",
        lambda: {
            "base_url": "https://api.primeintellect.ai",
            "frontend_url": "https://app.primeintellect.ai",
            "api_key": "test-api-key",
        },
    )

    seen_endpoints: list[str] = []

    def stub_request_json(
        method: str,
        base_url: str,
        endpoint: str,
        api_key: str,
        *,
        json_payload=None,
        timeout: float = 30.0,
    ):
        seen_endpoints.append(endpoint)
        if endpoint.startswith("/environmentshub/"):
            return {"data": {"id": "env-456"}}
        if endpoint == "/hosted-evaluations":
            return {"evaluation_id": "eval-456"}
        raise AssertionError(f"unexpected endpoint: {endpoint}")

    monkeypatch.setattr(eval_command, "_request_json", stub_request_json)

    eval_command.main(
        [
            "wordle",
            "--hosted",
            "--header",
            f"X-Prime-Eval-Env-Display: {display_header}",
        ]
    )

    # The very first request must be the hub lookup for the resolved slug.
    expected_endpoint = f"/environmentshub/primeintellect/wordle/@{expected_version}"
    assert seen_endpoints[0] == expected_endpoint


def test_main_hosted_supports_toml_config(monkeypatch, tmp_path):
    """Hosted mode driven by a TOML config: one hosted-evaluation POST per
    [[eval]] entry, with per-entry settings taking precedence over the
    environment's eval defaults, and CLI-wide --custom-secrets applied to
    every created evaluation.

    Note: the assertions on calls[0]/calls[2] rely on the exact request order
    (hub lookup, POST, hub lookup, POST).
    """
    config_path = tmp_path / "evals.toml"
    # File content is a placeholder; load_toml_config is stubbed below, so only
    # the path needs to exist for main() to accept it.
    config_path.write_text("[[eval]]\nenv_id='placeholder'\n", encoding="utf-8")

    monkeypatch.setattr(
        eval_command,
        "_load_prime_config",
        lambda: {
            "base_url": "https://api.primeintellect.ai",
            "frontend_url": "https://app.primeintellect.ai",
            "api_key": "test-api-key",
        },
    )
    # Two entries: one fully specified, one that only names an env (it should
    # fall back to the env's own eval defaults and the default model).
    monkeypatch.setattr(
        eval_command,
        "load_toml_config",
        lambda _path: [
            {
                "env_id": "primeintellect/gsm8k",
                "model": "openai/gpt-4.1-mini",
                "num_examples": 10,
                "rollouts_per_example": 2,
                "env_args": {"difficulty": "hard"},
            },
            {
                "env_id": "primeintellect/wordle@2.0.0",
            },
        ],
    )
    # Only the wordle env publishes defaults; they should surface in its payload.
    monkeypatch.setattr(
        eval_command.vf_eval,
        "get_env_eval_defaults",
        lambda env_id: (
            {"num_examples": 11, "rollouts_per_example": 5}
            if env_id == "primeintellect/wordle@2.0.0"
            else {}
        ),
    )

    calls: list[dict[str, object]] = []

    def fake_request_json(
        method: str,
        base_url: str,
        endpoint: str,
        api_key: str,
        *,
        json_payload=None,
        timeout: float = 30.0,
    ):
        calls.append(
            {
                "method": method,
                "base_url": base_url,
                "endpoint": endpoint,
                "api_key": api_key,
                "payload": json_payload,
                "timeout": timeout,
            }
        )
        if endpoint == "/environmentshub/primeintellect/gsm8k/@latest":
            return {"data": {"id": "env-1"}}
        if endpoint == "/environmentshub/primeintellect/wordle/@2.0.0":
            return {"data": {"id": "env-2"}}
        if endpoint == "/hosted-evaluations":
            # Derive a distinct evaluation id from how many POSTs (including
            # this one, already appended above) have been made so far.
            created_count = len([call for call in calls if call["method"] == "POST"])
            return {"evaluation_id": f"eval-{created_count}"}
        raise AssertionError(f"unexpected endpoint: {endpoint}")

    monkeypatch.setattr(eval_command, "_request_json", fake_request_json)

    eval_command.main(
        [
            str(config_path),
            "--hosted",
            "--custom-secrets",
            '{"API_KEY":"secret"}',
        ]
    )

    post_payloads = [call["payload"] for call in calls if call["method"] == "POST"]
    assert len(post_payloads) == 2
    # Request order: gsm8k lookup (0), POST (1), wordle lookup (2), POST (3).
    assert calls[0]["endpoint"] == "/environmentshub/primeintellect/gsm8k/@latest"
    assert calls[2]["endpoint"] == "/environmentshub/primeintellect/wordle/@2.0.0"
    # Entry 1: everything comes from the TOML entry plus the shared secrets.
    assert post_payloads[0] == {
        "environment_ids": ["env-1"],
        "inference_model": "openai/gpt-4.1-mini",
        "eval_config": {
            "num_examples": 10,
            "rollouts_per_example": 2,
            "allow_sandbox_access": False,
            "allow_instances_access": False,
            "env_args": {"difficulty": "hard"},
            "custom_secrets": {"API_KEY": "secret"},
        },
    }
    # Entry 2: env defaults (11/5) and the CLI default model fill the gaps.
    assert post_payloads[1] == {
        "environment_ids": ["env-2"],
        "inference_model": eval_command.vf_eval.DEFAULT_MODEL,
        "eval_config": {
            "num_examples": 11,
            "rollouts_per_example": 5,
            "allow_sandbox_access": False,
            "allow_instances_access": False,
            "custom_secrets": {"API_KEY": "secret"},
        },
    }
87 changes: 87 additions & 0 deletions tests/test_prime_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from pathlib import Path

import verifiers.cli.plugins.prime as prime_plugin


def _make_workspace(tmp_path: Path) -> tuple[Path, Path]:
workspace = tmp_path / "workspace"
env_dir = workspace / "environments" / "my_env"
env_dir.mkdir(parents=True)
(workspace / "verifiers").mkdir()
(workspace / "pyproject.toml").write_text(
'[project]\nname = "workspace"\nversion = "0.1.0"\n',
encoding="utf-8",
)
return workspace, env_dir


def _touch_python(venv_root: Path) -> Path:
    """Create an empty stand-in python binary inside *venv_root* and return its path."""
    interpreter = prime_plugin._venv_python(venv_root)
    interpreter.parent.mkdir(parents=True, exist_ok=True)
    interpreter.write_text("", encoding="utf-8")
    return interpreter


def test_find_workspace_root_from_nested_environment_dir(tmp_path: Path):
    """Workspace root discovery works starting from a nested environment dir."""
    root, nested = _make_workspace(tmp_path)

    assert prime_plugin._find_workspace_root(nested) == root


def test_resolve_workspace_python_prefers_workspace_venv_over_uv_env(
    tmp_path: Path, monkeypatch
):
    """The workspace .venv wins even when UV_PROJECT_ENVIRONMENT points elsewhere."""
    root, nested_env = _make_workspace(tmp_path)
    expected = _touch_python(root / ".venv")
    _touch_python(nested_env / ".venv")  # decoy venv inside the environment dir

    monkeypatch.setattr(prime_plugin, "_python_can_import_module", lambda *_: True)
    monkeypatch.setenv("UV_PROJECT_ENVIRONMENT", str(nested_env / ".venv"))
    monkeypatch.delenv("VIRTUAL_ENV", raising=False)

    assert prime_plugin._resolve_workspace_python(nested_env) == str(expected)


def test_build_module_command_install_adds_workspace_env_path(
    tmp_path: Path, monkeypatch
):
    """Install commands gain --path pointing at the workspace environments dir."""
    root, nested_env = _make_workspace(tmp_path)
    plugin = prime_plugin.PrimeCLIPlugin()

    monkeypatch.setattr(prime_plugin, "_current_cwd", lambda: nested_env)
    monkeypatch.setattr(prime_plugin, "_resolve_workspace_python", lambda *_: "python")

    expected = [
        "python",
        "-m",
        plugin.install_module,
        "my-env",
        "--path",
        str((root / "environments").resolve()),
    ]
    assert plugin.build_module_command(plugin.install_module, ["my-env"]) == expected


def test_build_module_command_eval_rewrites_relative_env_dir_path(
    tmp_path: Path, monkeypatch
):
    """A relative --env-dir-path is rewritten to the absolute workspace dir."""
    root, nested_env = _make_workspace(tmp_path)
    plugin = prime_plugin.PrimeCLIPlugin()

    monkeypatch.setattr(prime_plugin, "_current_cwd", lambda: nested_env)
    monkeypatch.setattr(prime_plugin, "_resolve_workspace_python", lambda *_: "python")

    result = plugin.build_module_command(
        plugin.eval_module,
        ["my-env", "--env-dir-path", "./environments"],
    )

    expected = [
        "python",
        "-m",
        plugin.eval_module,
        "my-env",
        "--env-dir-path",
        str((root / "environments").resolve()),
    ]
    assert result == expected
Loading
Loading