Skip to content
8 changes: 4 additions & 4 deletions configs/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,22 @@
"key": "PRIME_API_KEY",
},
"qwen3-vl-30b-i": {
"model": "qwen/qwen3-30b-a3b-instruct-2507",
"model": "qwen/qwen3-vl-30b-a3b-instruct",
"url": "https://api.pinference.ai/api/v1",
"key": "PRIME_API_KEY",
},
"qwen3-vl-30b-t": {
"model": "qwen/qwen3-30b-a3b-thinking-2507",
"model": "qwen/qwen3-vl-30b-a3b-thinking",
"url": "https://api.pinference.ai/api/v1",
"key": "PRIME_API_KEY",
},
"qwen3-vl-235b-i": {
"model": "qwen/qwen3-235b-a22b-instruct-2507",
"model": "qwen/qwen3-vl-235b-a22b-instruct",
"url": "https://api.pinference.ai/api/v1",
"key": "PRIME_API_KEY",
},
"qwen3-vl-235b-t": {
"model": "qwen/qwen3-235b-a22b-thinking-2507",
"model": "qwen/qwen3-vl-235b-a22b-thinking",
"url": "https://api.pinference.ai/api/v1",
"key": "PRIME_API_KEY",
},
Expand Down
215 changes: 215 additions & 0 deletions configs/endpoints.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
[[endpoint]]
endpoint_id = "olmo3-32b-t"
model = "allenai/olmo-3-32b-think"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "olmo3-7b-i"
model = "allenai/olmo-3-7b-instruct"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "olmo3-7b-t"
model = "allenai/olmo-3-7b-think"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "trinity-mini"
model = "arcee/trinity-mini"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "haiku"
model = "anthropic/claude-4.5-haiku"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "sonnet"
model = "anthropic/claude-4.5-sonnet"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "opus"
model = "anthropic/claude-4.5-opus"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gemini-2.5-flash"
model = "google/gemini-2.5-flash"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gemini-2.5-pro"
model = "google/gemini-2.5-pro"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gemini-3-flash"
model = "google/gemini-3-flash"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gemini-3-pro"
model = "google/gemini-3-pro-preview"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gemini-3-pro-exp"
model = "google/gemini-3-pro-preview"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-30b-i"
model = "qwen/qwen3-30b-a3b-instruct-2507"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-30b-t"
model = "qwen/qwen3-30b-a3b-thinking-2507"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-235b-i"
model = "qwen/qwen3-235b-a22b-instruct-2507"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-235b-t"
model = "qwen/qwen3-235b-a22b-thinking-2507"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-vl-30b-i"
model = "qwen/qwen3-vl-30b-a3b-instruct"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-vl-30b-t"
model = "qwen/qwen3-vl-30b-a3b-thinking"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-vl-235b-i"
model = "qwen/qwen3-vl-235b-a22b-instruct"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "qwen3-vl-235b-t"
model = "qwen/qwen3-vl-235b-a22b-thinking"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "kimi-k2"
model = "moonshotai/kimi-k2-0905"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "kimi-k2-t"
model = "moonshotai/kimi-k2-thinking"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gpt-oss-120b"
model = "openai/gpt-oss-120b"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gpt-oss-20b"
model = "openai/gpt-oss-20b"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "gpt-4.1-nano"
model = "gpt-4.1-nano"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "gpt-4.1-mini"
model = "gpt-4.1-mini"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "gpt-4.1"
model = "gpt-4.1"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "gpt-5-nano"
model = "gpt-5-nano"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "gpt-5-mini"
model = "gpt-5-mini"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "gpt-5"
model = "gpt-5"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "gpt-5.1"
model = "gpt-5.1"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "gpt-5.2"
model = "gpt-5.2"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"

[[endpoint]]
endpoint_id = "glm-4.5"
model = "z-ai/glm-4.5"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "glm-4.5-air"
model = "z-ai/glm-4.5-air"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "glm-4.6"
model = "z-ai/glm-4.6"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"

[[endpoint]]
endpoint_id = "glm-4.7"
model = "z-ai/glm-4.7"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
21 changes: 21 additions & 0 deletions configs/eval/multi-env.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
endpoints_path = "../endpoints.toml"

endpoint_id = "gpt-5-mini"
save_results = true
rollouts_per_example = 3

[[eval]]
env_id = "bfcl-v3"
num_examples = 100

[[eval]]
env_id = "tau2-bench"
num_examples = 100

[[eval]]
env_id = "wiki-search"
num_examples = 100

[[eval]]
env_id = "tool-test"
num_examples = 100
21 changes: 19 additions & 2 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ prime eval run my-env -x '{"max_turns": 20}'
| `--model` | `-m` | `openai/gpt-4.1-mini` | Model name or endpoint alias |
| `--api-base-url` | `-b` | `https://api.pinference.ai/api/v1` | API base URL |
| `--api-key-var` | `-k` | `PRIME_API_KEY` | Environment variable containing API key |
| `--endpoints-path` | `-e` | `./configs/endpoints.py` | Path to endpoints registry |
| `--endpoints-path` | `-e` | `./configs/endpoints.toml` | Path to endpoints registry (`.toml` preferred, `.py` supported) |
| `--header` | — | — | Extra HTTP header (`Name: Value`), repeatable |

For convenience, define model endpoints in `./configs/endpoints.py` to avoid repeating URL and key flags:
For convenience, define model endpoints in `./configs/endpoints.toml` (or `./configs/endpoints.py`) to avoid repeating URL and key flags:

```python
ENDPOINTS = {
Expand All @@ -86,6 +86,18 @@ ENDPOINTS = {
}
```

Equivalent TOML format:

```toml
[[endpoint]]
endpoint_id = "gpt-4.1-mini"
model = "gpt-4.1-mini"
url = "https://api.openai.com/v1"
key = "OPENAI_API_KEY"
```

To define multiple equivalent replicas of an endpoint, add multiple `[[endpoint]]` entries with the same `endpoint_id`.

Then use the alias directly:

```bash
Expand All @@ -94,6 +106,10 @@ prime eval run my-env -m qwen3-235b-i

If the model name is in the registry, those values are used by default, but you can override them with `--api-base-url` and/or `--api-key-var`. If the model name isn't found, the CLI flags are used (falling back to defaults when omitted).

In other words, `-m/--model` is treated as an endpoint alias lookup when present in the registry, and otherwise treated as a literal model id.

When using eval TOML configs, you can set `endpoint_id` in `[[eval]]` sections to resolve a model from the endpoint registry; this is only supported when `endpoints_path` points to a TOML registry file.

### Sampling Parameters

| Flag | Short | Default | Description |
Expand Down Expand Up @@ -273,6 +289,7 @@ Each `[[eval]]` section must contain an `env_id` field. All other fields are opt
| `rollouts_per_example` | integer | Rollouts per example |
| `extra_env_kwargs` | table | Arguments passed to environment constructor |
| `model` | string | Model to evaluate |
| `endpoint_id` | string | Endpoint registry id (requires TOML `endpoints_path`) |

Example with `env_args`:

Expand Down
11 changes: 10 additions & 1 deletion docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ class GenerateMetadata(TypedDict):
tools: list[ChatCompletionToolParam] | None
```

`base_url` is always serialized as a string. For multi-endpoint runs (e.g., using `ClientConfig.endpoint_configs`), it is stored as a comma-separated list of URLs.

### RolloutScore / RolloutScores

```python
Expand Down Expand Up @@ -567,15 +569,19 @@ Combines rubrics for `EnvGroup`.

```python
class ClientConfig(BaseModel):
client_idx: int = 0
api_key_var: str = "PRIME_API_KEY"
api_base_url: str = "https://api.pinference.ai/api/v1"
endpoint_configs: list[ClientConfig] = []
timeout: float = 3600.0
max_connections: int = 28000
max_keepalive_connections: int = 28000
max_retries: int = 10
extra_headers: dict[str, str] = {}
```

Use `endpoint_configs` for multi-endpoint round-robin. In grouped scoring mode, groups are distributed round-robin across endpoint configs.

When `api_key_var` is `"PRIME_API_KEY"` (the default), credentials are loaded with the following precedence:
- **API key**: `PRIME_API_KEY` env var > `~/.prime/config.json` > `"EMPTY"`
- **Team ID**: `PRIME_TEAM_ID` env var > `~/.prime/config.json` > not set
Expand All @@ -589,6 +595,7 @@ class EvalConfig(BaseModel):
env_id: str
env_args: dict
env_dir_path: str
endpoint_id: str | None = None
model: str
client_config: ClientConfig
sampling_args: SamplingArgs
Expand All @@ -610,9 +617,11 @@ class EvalConfig(BaseModel):

```python
Endpoint = TypedDict("Endpoint", {"key": str, "url": str, "model": str})
Endpoints = dict[str, Endpoint]
Endpoints = dict[str, list[Endpoint]]
```

`Endpoints` maps an endpoint id to one or more endpoint variants. A single variant is represented as a one-item list.

---

## Decorators
Expand Down
Loading
Loading