Merged
8 changes: 5 additions & 3 deletions AGENTS.md
@@ -35,8 +35,10 @@ prime lab setup
 This sets up a Python project if needed (with `uv init`), installs `verifiers` (with `uv add verifiers`), creates the recommended workspace structure, and downloads useful starter files:
 ```
 configs/
-├── endpoints.py # OpenAI-compatible API endpoint configuration
-└── lab/ # Example configs for Hosted Training
+├── endpoints.toml # OpenAI-compatible API endpoint configuration
+├── rl/ # Example configs for Hosted Training
+├── eval/ # Example multi-environment eval configs
+└── gepa/ # Example configs for prompt optimization
 environments/
 └── AGENTS.md # Documentation for AI coding agents
 AGENTS.md # Top-level documentation for AI coding agents
@@ -90,7 +92,7 @@ To run a local evaluation with any OpenAI-compatible model, do:
 ```bash
 prime eval run my-env -m gpt-5-nano # run and save eval results locally
 ```
-Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overview) by default; configure your own API endpoints in `./configs/endpoints.py`.
+Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overview) by default; configure your own API endpoints in `./configs/endpoints.toml`.
 
 View local evaluation results in the terminal UI:
 ```bash
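Since both docs now point at `./configs/endpoints.toml`, a sketch of what an entry in that file might look like may help orient readers. Every key name below is an illustrative assumption, not the schema the `prime` CLI actually uses:

```toml
# Hypothetical endpoints.toml entry — the real schema is defined by the
# `prime` CLI; all key names here are assumptions for illustration only.
[qwen3-30b-i]
model = "Qwen/Qwen3-30B-A3B-Instruct-2507"  # upstream model name (assumed key)
base_url = "http://localhost:8000/v1"       # OpenAI-compatible endpoint (assumed key)
api_key_env = "OPENAI_API_KEY"              # env var holding the key (assumed key)
```

A config such as `configs/gepa/wordle.toml` could then reference the endpoint by its table name (`qwen3-30b-i`), which matches how that file sets `model`.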
3 changes: 1 addition & 2 deletions configs/eval/debug.toml
@@ -1,6 +1,5 @@
 model = "openai/gpt-4.1-mini"
 save_results = true
-save_every = 10
 
 [[eval]]
 env_id = "primeintellect/wiki-search"
@@ -13,4 +12,4 @@ sampling_args = { max_tokens = 1024 }
 independent_scoring = true
 
 [[eval]]
-env_id = "alphabet-sort"
+env_id = "alphabet-sort"
5 changes: 2 additions & 3 deletions configs/eval/tools.toml
@@ -1,6 +1,5 @@
 model = "openai/gpt-5-mini"
 save_results = true
-save_every = 10
 
 [[eval]]
 env_id = "bfcl-v3"
@@ -10,7 +9,7 @@ rollouts_per_example = 3
 [[eval]]
 env_id = "tau2-bench"
 num_examples = 100
-rollouts_per_example = 3
+rollouts_per_example = 3
 
 [[eval]]
 env_id = "wiki-search"
@@ -20,4 +19,4 @@ rollouts_per_example = 3
 [[eval]]
 env_id = "tool-test"
 num_examples = 100
-rollouts_per_example = 3
+rollouts_per_example = 3
21 changes: 21 additions & 0 deletions configs/gepa/base.toml
@@ -0,0 +1,21 @@
+model = "openai/gpt-4.1-mini"
+reflection_model = "openai/gpt-4.1-mini"
+endpoints_path = "../endpoints.toml"
+
+[env]
+env_id = "primeintellect/wiki-search"
+env_args = {}
+extra_env_kwargs = {}
+
+[gepa]
+max_calls = 500
+num_train = 100
+num_val = 50
+minibatch_size = 3
+# perfect_score = 1.0
+# state_columns = ["tool_calls"]
+
+[execution]
+max_concurrent = 32
+seed = 0
+# sampling_args = { max_tokens = 512, temperature = 0.7 }
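A rough budget check for the `[gepa]` block above — assuming `max_calls` caps the total number of scored rollouts, which is an assumption about GEPA's accounting rather than something this diff states:

```python
# Values from configs/gepa/base.toml above.
max_calls = 500
minibatch_size = 3
num_val = 50

# If every reflective step scores one minibatch of rollouts, the budget
# allows at most this many minibatch steps:
max_minibatch_steps = max_calls // minibatch_size
print(max_minibatch_steps)  # 166

# Each full validation pass of a promising candidate costs num_val more
# calls, so in practice far fewer candidates get fully validated.
```

This is why `minibatch_size` stays small (3) relative to `num_val` (50): cheap minibatch reflections explore many prompt variants, while expensive validation runs are reserved for the best ones.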
21 changes: 21 additions & 0 deletions configs/gepa/wordle.toml
@@ -0,0 +1,21 @@
+model = "qwen3-30b-i"
+reflection_model = "qwen3-30b-i"
+endpoints_path = "../endpoints.toml"
+
+[env]
+env_id = "primeintellect/wordle"
+env_args = {}
+extra_env_kwargs = {}
+
+[gepa]
+max_calls = 500
+num_train = 100
+num_val = 50
+minibatch_size = 3
+perfect_score = 1.0
+state_columns = ["turn"]
+
+[execution]
+max_concurrent = 32
+seed = 0
+sampling_args = { max_tokens = 1024, temperature = 0.7 }
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion docs/development.md
@@ -48,7 +48,7 @@ verifiers/
 │ ├── rubrics/ # Rubric classes
 │ ├── rl/ # Training infrastructure
 │ │ ├── inference/ # vLLM server utilities
-│ │ └── trainer/ # RLTrainer implementation
+│ │ └── trainer/ # Trainer implementation
 │ ├── scripts/ # CLI entry points
 │ └── utils/ # Utilities
 ├── environments/ # Installable environment modules
7 changes: 0 additions & 7 deletions docs/faqs.md
@@ -103,13 +103,6 @@ def relative_reward(completions: list, answers: list, **kwargs) -> list[float]:
 
 ## Training
 
-### What's the difference between `prime-rl` and `vf-rl`?
-
-- **prime-rl**: Production-ready, multi-node, MoE support, advanced features. Use for serious training.
-- **vf-rl**: Minimal (~1000 LOC), single-node, hackable. Use for small-scale testing or as a starting point for your own training loop.
-
-Both use the same core algorithm (async CISPO).
-
 ### How do I use a local vLLM server?
 
 Point the client to your local server:
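For the retained FAQ entry on local vLLM servers, the usual pattern is environment variables that OpenAI-compatible clients honor; a sketch, assuming the OpenAI Python SDK (v1+), which reads these at client construction time:

```python
import os

# Point any OpenAI-compatible client at a local vLLM server without
# code changes — the OpenAI SDK picks these up automatically.
os.environ["OPENAI_BASE_URL"] = "http://localhost:8000/v1"
os.environ["OPENAI_API_KEY"] = "EMPTY"  # vLLM accepts any key by default
```

The same two settings work for any tool built on the SDK, since the base URL is resolved once when the client is created.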
8 changes: 5 additions & 3 deletions docs/overview.md
@@ -31,8 +31,10 @@ prime lab setup
 This sets up a Python project if needed (with `uv init`), installs `verifiers` (with `uv add verifiers`), creates the recommended workspace structure, and downloads useful starter files:
 ```
 configs/
-├── endpoints.py # OpenAI-compatible API endpoint configuration
-└── lab/ # Example configs for Hosted Training
+├── endpoints.toml # OpenAI-compatible API endpoint configuration
+├── rl/ # Example configs for Hosted Training
+├── eval/ # Example multi-environment eval configs
+└── gepa/ # Example configs for prompt optimization
 environments/
 └── AGENTS.md # Documentation for AI coding agents
 AGENTS.md # Top-level documentation for AI coding agents
@@ -86,7 +88,7 @@ To run a local evaluation with any OpenAI-compatible model, do:
 ```bash
 prime eval run my-env -m gpt-5-nano # run and save eval results locally
 ```
-Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overview) by default; configure your own API endpoints in `./configs/endpoints.py`.
+Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overview) by default; configure your own API endpoints in `./configs/endpoints.toml`.
 
 View local evaluation results in the terminal UI:
 ```bash
147 changes: 23 additions & 124 deletions docs/training.md
@@ -8,11 +8,7 @@ This section covers how to use Verifiers environments for RL training with our H
 - [Configuration](#configuration)
 - [Training with `prime-rl`](#training-with-prime-rl)
 - [Setup and Configuration](#setup-and-configuration)
-- [Training with `vf.RLTrainer`](#training-with-vfrltrainer)
-- [Setup and Configuration](#setup-and-configuration)
-- [Generation Parameters](#generation-parameters)
-- [Training Schedule](#training-schedule)
-- [Prompt Optimization with `vf-gepa`](#prompt-optimization-with-vf-gepa)
+- [Prompt Optimization with `prime gepa run`](#prompt-optimization-with-prime-gepa-run)
 - [Usage](#usage)
 - [Output](#output)
 - [RL Rules of Thumb](#rl-rules-of-thumb)
@@ -37,17 +33,23 @@ Use the `prime lab setup` script to download example configuration files for Hos
 prime lab setup
 ```
 
-This will download example TOML configs for Hosted Training into `configs/lab/`, along with `endpoints.py`:
+This will download example TOML configs for Hosted Training into `configs/rl/`, example eval configs into `configs/eval/`, and GEPA starter configs into `configs/gepa/`, along with `endpoints.toml`:
 
 ```
 configs/
-├── endpoints.py
-└── lab/
-    ├── alphabet-sort.toml
-    ├── gsm8k.toml
-    ├── math-python.toml
-    ├── reverse-text.toml
-    ├── wiki-search.toml
-    └── wordle.toml
+├── endpoints.toml
+├── eval/
+│   ├── minimal.toml
+│   └── multi-env.toml
+├── rl/
+│   ├── alphabet-sort.toml
+│   ├── gsm8k.toml
+│   ├── math-python.toml
+│   ├── reverse-text.toml
+│   ├── wiki-search.toml
+│   └── wordle.toml
+└── gepa/
+    ├── base.toml
+    └── wordle.toml
 ```
 
@@ -102,122 +104,15 @@ uv run prime-rl @ configs/prime-rl/wiki-search.toml
 
 This will launch a tmux session with separate panes for the trainer, orchestrator, and inference server. For further configuration options, see the [prime-rl documentation](https://docs.primeintellect.ai/prime-rl).
 
-## Training with `vf.RLTrainer`
-
-> **Note:** `vf.RLTrainer` is intended for educational/demo purposes only and is not actively maintained. For production RL training, please use [`prime-rl`](#training-with-prime-rl) instead.
-
-If you want to hack on new training algorithms and are less concerned with maximum performance or advanced features, you can use the included `RLTrainer` (via `vf-rl`), whose core files are under 1000 lines of code and include only the most essential logic for fairly-performant async off-policy training (with a similar core algorithm as `prime-rl`).
-
-The included `RLTrainer` is a minimal, hackable training loop based on `transformers.Trainer` that supports both full-parameter finetuning and LoRA training. `RLTrainer` can be viewed as a "baby" `prime-rl` that adopts a similar default training recipe (async CISPO with one-step off-policy overlap), intended for single-node test runs with dense models. The primary files (`trainer.py` and `orchestrator.py`, located in `packages/verifiers-rl/verifiers_rl/rl/trainer/`) are under 1000 lines of code, and are designed to be a convenient starting point for writing your own training loop.
-
-The feature set is intentionally kept minimal and focused. Users seeking maximum performance, MoE support, multi-node training, multidimensional parallelism, and other advanced features should use the `prime-rl` trainer.
-
-### Setup and Configuration
-
-To use `vf.RLTrainer` in your own project, install the optional RL package:
-```bash
-uv add verifiers-rl
-```
-
-Then, use the `vf-setup` script to download example configuration files for `vf.RLTrainer` into your workspace:
-
-```bash
-prime lab setup --vf-rl
-```
-This will download example TOML configs for `vf.RLTrainer` into `configs/vf-rl/`, along with `endpoints.py`:
-
-```
-configs/
-├── endpoints.py
-└── vf-rl/
-    ├── alphabet-sort.toml
-    ├── gsm8k.toml
-    ├── math-python.toml
-    ├── reverse-text.toml
-    ├── wiki-search.toml
-    └── wordle.toml
-```
-
-`vf-rl` can be used with a single TOML file, largely mirroring the configuration options for `prime-rl` but with some key differences in organization and feature sets.
-
-Example configuration file for the `primeintellect/wiki-search` Environment with `Qwen/Qwen3-4B-Instruct-2507`:
-
-```toml
-model = "Qwen/Qwen3-4B-Instruct-2507"
-
-[env]
-id = "primeintellect/wiki-search"
-
-[env.args]
-max_turns = 10
-
-[inference]
-gpus = 1
-
-[inference.args]
-enable_auto_tool_choice = true
-tool_call_parser = "hermes"
-
-[trainer]
-gpus = 1
-
-[trainer.args]
-run_name = "wiki-search"
-micro_batch_size = 4
-rollouts_per_example = 16
-batch_size = 1024
-max_steps = 500
-max_tokens = 512
-max_seq_len = 4096
-```
-
-To start a training run with `vf.RLTrainer`, do:
-
-```bash
-uv run vf-rl @ configs/vf-rl/wiki-search.toml
-```
-
-Key fields in `[trainer.args]`:
-- `rollouts_per_example`: completions per prompt (group size)
-- `micro_batch_size`: rollouts per GPU per step
-- `batch_size`: rollouts per global batch (must be divisible by `micro_batch_size * world_size`)
-
-**How to think about batch settings:**
-- `rollouts_per_example`: Larger groups (16-32) increase reward diversity but increase training time and memory usage
-- `micro_batch_size`: Limited by GPU memory after model weights
-- `batch_size`: Total rollouts per global batch (must be divisible by `micro_batch_size` and `rollouts_per_example`)
-
-### Generation Parameters
-
-Both `prime-rl` and `vf-rl` support configurable generation parameters, including:
-- `max_tokens`: maximum number of tokens to generate per turn
-- `temperature`: temperature for sampling
-- `top_p`: top-p sampling
-- `top_k`: top-k sampling
-- `min_p`: minimum probability for sampling
-- `repetition_penalty`: repetition penalty for sampling
-
-In `prime-rl`, these parameters are configured in the `[orchestrator.sampling]` section, and in `vf-rl`, they are configured in the `[trainer.args]` section.
-
-### Training Schedule
-
-Core fields in `[trainer.args]`:
-- `learning_rate`, `lr_scheduler_type`, `warmup_steps`, `max_steps`
-- `max_grad_norm`, `bf16`, `gradient_checkpointing`
-
-### Model loading
-
-By default, `vf.RLTrainer` will use Liger Kernel for optimized training. To disable Liger Kernel, set `use_liger = false` in `[trainer.args]`.
-
-## Prompt Optimization with `vf-gepa`
+## Prompt Optimization with `prime gepa run`
 
-`vf-gepa` is a CLI for automatic system prompt optimization using [GEPA](https://github.com/gepa-ai/gepa) (Genetic-Pareto prompt optimization). It iteratively refines your environment's system prompt using a teacher LLM to reflect on evaluation results, without requiring gradient-based training. Current support is for system prompt optimization only.
+`prime gepa run` is the CLI entrypoint for automatic system prompt optimization using [GEPA](https://github.com/gepa-ai/gepa) (Genetic-Pareto prompt optimization). It iteratively refines your environment's system prompt using a teacher LLM to reflect on evaluation results, without requiring gradient-based training. Current support is for system prompt optimization only.
 
 ### Usage
 
-Basic usage mirrors `vf-eval`:
+Basic usage mirrors `prime eval run`:
 ```bash
-vf-gepa wiki-search --model google/gemini-3-flash-preview
+prime gepa run wiki-search --model google/gemini-3-flash-preview
 ```
 
 This will optimize the system prompt for the `wiki-search` environment using the specified model for both evaluation rollouts and reflection. Results are saved to `environments/wiki-search/outputs/gepa/`.
@@ -288,6 +183,10 @@ The best way to improve training is to ensure appropriate task difficulty for yo
 
 `verifiers` is intended to be largely trainer-agnostic and is straightforward to support for any trainer which can expose an OpenAI-compatible inference client for rollouts.
 
+### `vf.RLTrainer` (Legacy)
+
+The legacy `vf.RLTrainer` still exists for educational and experimental purposes via the optional `verifiers-rl` package and the `vf-rl` entrypoint, but it is not actively maintained. It is a compact single-node async RL trainer with a narrower feature set than production trainers. Its core implementation (`trainer.py` and `orchestrator.py` under `packages/verifiers-rl/verifiers_rl/rl/trainer/`) remains intentionally lightweight for algorithm experimentation. For production training and current guidance, use [`prime-rl`](#training-with-prime-rl).
+
 ### Tinker
 
 [Tinker](https://thinkingmachines.ai/tinker/) supports Verifiers environments via the `tinker-cookbook` recipes.
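The trainer-agnostic contract described above boils down to serving the OpenAI chat-completions wire format for rollouts. As a rough illustration of the minimum any such trainer must accept (the exact field set a given trainer supports is an assumption here; this shows only the standard request shape):

```python
import json

# Minimal chat-completions request body that an OpenAI-compatible rollout
# server is expected to accept (illustrative field set only).
payload = {
    "model": "Qwen/Qwen3-4B-Instruct-2507",
    "messages": [{"role": "user", "content": "Reverse the string 'abc'."}],
    "max_tokens": 512,
    "temperature": 0.7,
}

# Any HTTP client can POST this to `<base_url>/chat/completions`.
body = json.dumps(payload)
print(json.loads(body)["messages"][0]["role"])  # user
```

Because rollouts go through this one interface, swapping trainers mostly means pointing the environment's client at a different base URL.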
2 changes: 1 addition & 1 deletion environments/AGENTS.md
@@ -801,4 +801,4 @@ Newer and more experimental environment classes include:
 - **`GymEnv`** — universal runner for Gym-compatible environments (OpenAI Gym / Gymnasium API)
 - **`CliAgentEnv`** — runs custom agent code inside sandboxes, intercepting API requests. Accepts sandbox configuration parameters including `docker_image`, `cpu_cores`, `memory_gb`, `disk_size_gb`, `gpu_count`, `timeout_minutes`, `environment_vars`, and `labels` for sandbox categorization
 - **`HarborEnv`** — loads Harbor-format agent benchmark tasks
-- **`RLMEnv`** — implements Recursive Language Models for unbounded context processing. Execution supports both local and sandbox backends via `execution_backend` (`"local"` default, `"sandbox"` to run the REPL inside a Prime Sandbox). Context is still filesystem-based: a provided `context_dir` is copied into the working directory, or legacy JSON-serializable `context` data is written to `context.json`/`context.txt`. The RLM scaffolding prompt (filesystem availability note, REPL workflow, tool docs) is injected into the first user message wrapped in `<RLM_SCAFFOLDING>...</RLM_SCAFFOLDING>`, preserving any external system prompt; the model-visible prompt is stored in `state["prompt"]`, while the original input prompt is preserved in `state["raw_prompt"]`. The REPL language is configurable via `repl_language` (default: `bash`); use `repl_language="python"` to retain the Python REPL. Bash mode uses `call_bash_repl` and behaves like a terminal; Python mode uses `call_python_repl`. Sub-LLM and root-tool interception for sandboxes is routed through a Prime Tunnel unless `interception_url` is provided. Tooling can be split via `tools` (shared), `root_tools` (REPL-only), and `sub_tools` (sub-LLM tools). Fixed root tools like `llm_batch` are always present and cannot be overridden. Tool ordering is fixed tools → shared tools → role-specific tools, with per-list deduplication by name. Root tools are callable only inside the REPL; sub-LLM tools use standard tool-calling.
+- **`RLMEnv`** — implements Recursive Language Models for unbounded context processing. Execution supports both local and sandbox backends via `execution_backend` (`"local"` default, `"sandbox"` to run the REPL inside a Prime Sandbox). Context is still filesystem-based: a provided `context_dir` is copied into the working directory, or legacy JSON-serializable `context` data is written to `context.json`/`context.txt`. The RLM scaffolding prompt (filesystem availability note, REPL workflow, tool docs) is injected into the first user message wrapped in `<RLM_SCAFFOLDING>...</RLM_SCAFFOLDING>`, preserving any external system prompt; the model-visible prompt is stored in `state["prompt"]`, while the original input prompt is preserved in `state["raw_prompt"]`. The REPL language is configurable via `repl_language` (default: `bash`); use `repl_language="python"` to retain the Python REPL. Bash mode uses `call_bash_repl` and behaves like a terminal; Python mode uses `call_python_repl`. Sub-LLM and root-tool interception for sandboxes is routed through a Prime Tunnel unless `interception_url` is provided. Tooling can be split via `tools` (shared), `root_tools` (REPL-only), and `sub_tools` (sub-LLM tools). Fixed root tools like `llm_batch` are always present and cannot be overridden. Tool ordering is fixed tools → shared tools → role-specific tools, with per-list deduplication by name. Root tools are callable only inside the REPL; sub-LLM tools use standard tool-calling. When using the sandbox backend, the sandbox and worker are started eagerly during `setup_state`, and package installs are skipped when the package is already importable in the image. Environments can pre-set `state["rlm_fs_root_remote"]` (and optionally `state["rlm_control_dir_remote"]`) before calling `super().setup_state` to point the worker at an existing filesystem path in the sandbox. For further customization, override `get_sandbox_request`, `on_sandbox_ready`, or `customize_worker_script` on `RLMEnv`.
9 changes: 9 additions & 0 deletions tests/test_eval_cli.py
@@ -441,6 +441,15 @@ def test_load_toml_config_single_eval():
     assert result[0]["env_id"] == "env1"
 
 
+def test_repo_eval_example_configs_are_valid():
+    """Bundled example configs should parse with the current eval config schema."""
+    config_paths = sorted(Path("configs/eval").glob("*.toml"))
+    assert config_paths
+    for config_path in config_paths:
+        loaded = load_toml_config(config_path)
+        assert loaded, f"{config_path} should contain at least one [[eval]] section"
+
+
 def test_load_toml_config_multi_env():
     """Multiple envs load correctly."""
     with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f: