2 changes: 1 addition & 1 deletion docs/evaluation.md
@@ -135,7 +135,7 @@ The `--max-retries` flag enables automatic retry with exponential backoff when r
| Flag | Short | Default | Description |
|------|-------|---------|-------------|
| `--verbose` | `-v` | false | Enable debug logging |
-| `--tui` | | false | Show live-updating TUI for multi-env evals |
+| `--tui` | `-u` | false | Use alternate screen mode (TUI) for display |
| `--save-results` | `-s` | false | Save results to disk |
| `--save-every` | `-f` | -1 | Save checkpoint every N rollouts |
| `--state-columns` | `-C` | — | Extra state columns to save (comma-separated) |
35 changes: 35 additions & 0 deletions docs/training.md
@@ -12,6 +12,9 @@ This section covers how to use Verifiers environments for RL training with our H
- [Setup and Configuration](#setup-and-configuration)
- [Generation Parameters](#generation-parameters)
- [Training Schedule](#training-schedule)
- [Prompt Optimization with `vf-gepa`](#prompt-optimization-with-vf-gepa)
- [Usage](#usage)
- [Output](#output)
- [RL Rules of Thumb](#rl-rules-of-thumb)
- [Before Training](#before-training)
- [Performance Trade-offs](#performance-trade-offs)
@@ -203,6 +206,38 @@ Core fields in `[trainer.args]`:

By default, `vf.RLTrainer` will use Liger Kernel for optimized training. To disable Liger Kernel, set `use_liger = false` in `[trainer.args]`.
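
For example, a minimal trainer-config fragment (other `[trainer.args]` keys omitted):

```toml
# Sketch: disabling Liger Kernel; all other trainer args left at defaults
[trainer.args]
use_liger = false
```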

## Prompt Optimization with `vf-gepa`

`vf-gepa` is a CLI for automatic prompt optimization using [GEPA](https://github.com/gepa-ai/gepa) (Genetic-Pareto prompt optimization). It iteratively refines your environment's system prompt by having a teacher LLM reflect on evaluation results; no gradient-based training is required. Currently, only the system prompt can be optimized.

### Usage

Basic usage mirrors `vf-eval`:
```bash
vf-gepa wiki-search --model google/gemini-3-flash-preview
```

This will optimize the system prompt for the `wiki-search` environment using the specified model for both evaluation rollouts and reflection. Results are saved to `environments/wiki-search/outputs/gepa/`.

Key options:
- `--model` / `-m`: Model for evaluation rollouts
- `--reflection-model` / `-M`: Teacher model for prompt reflection (defaults to `--model`)
- `--max-calls` / `-B`: Evaluation budget (default: 500)
- `--num-train` / `-n`: Training examples (default: 100)
- `--num-val` / `-N`: Validation examples (default: 50)
- `--minibatch-size`: Number of examples evaluated together per reflection step (default: 3)
- `--perfect-score`: Maximum achievable score for a rollout in your environment; minibatches that reach this score are skipped during reflection. Set this only if your environment has a known maximum score
- `--state-columns`: Additional state columns to copy into the reflection dataset. By default, `query`, `completion`, `expected_answer`, `reward`, and `error` are included. Use this to add environment-specific state fields (e.g., `--state-columns tool_calls reasoning_trace`)
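
For reference, each entry of the reflective dataset shown to the teacher model is a flat record built from one rollout. A minimal sketch of its shape (values are placeholders; field names follow the defaults listed above):

```python
# Sketch of a single reflective-dataset record (placeholder values).
# Default fields are query/completion/expected_answer/reward; extra keys
# appear for errored rollouts or when requested via --state-columns.
record = {
    "query": "...",                  # the user message from the prompt
    "completion": [                  # model output as printable messages
        {"role": "assistant", "content": "..."},
    ],
    "expected_answer": "...",
    "reward": 0.0,
}
# Present only when the rollout errored:
record["error"] = "TimeoutError('rollout timed out')"
# Added via e.g. `--state-columns tool_calls`:
record["tool_calls"] = [{"name": "search", "arguments": {"q": "..."}}]
```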

### Output

After optimization, you'll find:
- `best_prompt.txt` - The optimized system prompt
- `pareto_frontier.jsonl` - Best prompts per validation example
- `metadata.json` - Run configuration and summary

Use `vf-eval` to verify performance before and after optimization.

## RL Rules of Thumb

RL training can be sensitive to implementation details and hyperparameters. Some simple practical guidance:
9 changes: 4 additions & 5 deletions environments/alphabet_sort/alphabet_sort.py
@@ -1,5 +1,6 @@
import difflib
import json
import logging
import random
import re
from typing import List
@@ -8,6 +9,8 @@

import verifiers as vf

logger = logging.getLogger(__name__)


def _extract_first_name(combined_name: str) -> str:
"""Extract first name from combined name like 'VladimirDrinfeld' -> 'Vladimir'"""
@@ -174,12 +177,8 @@ def get_random_turn_config():
)

except Exception as e:
-print(f"Error line {line_num}: {e}")
+logger.error(f"Error line {line_num}: {e}")

-print(
-    f"Dataset: {len(data)} examples with {min_turns}-{max_turns} turns, "
-    f"{min_names_per_turn}-{max_names_per_turn} names per turn"
-)
return Dataset.from_list(data)

return build
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -44,6 +44,7 @@ dependencies = [
"tomli; python_version < '3.11'",
"typing_extensions; python_version < '3.12'",
"wget>=3.2",
"gepa"
]

[dependency-groups]
@@ -91,6 +92,7 @@ flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }

[project.scripts]
vf-eval = "verifiers.scripts.eval:main"
vf-gepa = "verifiers.scripts.gepa:main"
vf-init = "verifiers.scripts.init:main"
vf-install = "verifiers.scripts.install:main"
vf-setup = "verifiers.scripts.setup:main"
12 changes: 12 additions & 0 deletions verifiers/gepa/__init__.py
@@ -0,0 +1,12 @@
from verifiers.gepa.adapter import VerifiersGEPAAdapter, make_reflection_lm
from verifiers.gepa.gepa_utils import save_gepa_results
from verifiers.gepa.config import GEPAConfig
from verifiers.gepa.display import GEPADisplay

__all__ = [
"VerifiersGEPAAdapter",
"GEPAConfig",
"GEPADisplay",
"make_reflection_lm",
"save_gepa_results",
]
216 changes: 216 additions & 0 deletions verifiers/gepa/adapter.py
@@ -0,0 +1,216 @@
import asyncio
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Callable, Mapping, Sequence

from openai import AsyncOpenAI, OpenAI

from gepa.core.adapter import EvaluationBatch

from verifiers.envs.environment import Environment
from verifiers.types import ClientConfig, Messages, RolloutInput, SamplingArgs, State
from verifiers.utils.message_utils import message_to_printable, messages_to_printable

if TYPE_CHECKING:
from verifiers.gepa.display import GEPADisplay

logger = logging.getLogger(__name__)


def make_reflection_lm(
client_config: ClientConfig,
model: str,
**kwargs: Any,
) -> Callable[[str], str]:
"""
Create a synchronous reflection LM callable for GEPA.

GEPA expects: reflection_lm(prompt: str) -> str
"""
import os

client = OpenAI(
api_key=os.environ.get(client_config.api_key_var, ""),
base_url=client_config.api_base_url,
timeout=client_config.timeout,
max_retries=client_config.max_retries,
)

def reflection_lm(prompt: str) -> str:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
**kwargs,
)
content = response.choices[0].message.content
return content or ""

return reflection_lm


@dataclass
class VerifiersGEPAAdapter:
"""Bridges GEPA optimization loop with verifiers evaluation infrastructure."""

env: Environment
client: AsyncOpenAI
model: str
sampling_args: SamplingArgs | None = None
max_concurrent: int = 32
state_columns: list[str] = field(default_factory=list)

# Optional display for progress updates
display: "GEPADisplay | None" = None

# GEPA adapter protocol: None means use default proposer with reflection_lm
propose_new_texts: Callable[..., dict[str, str]] | None = None

# Display control
use_tqdm: bool = False

# Internal: track candidates by prompt hash
_seen_prompts: dict[str, int] = field(default_factory=dict)

def evaluate(
self,
batch: list[RolloutInput],
candidate: dict[str, str],
capture_traces: bool = False,
) -> EvaluationBatch[State, dict[str, Any]]:
"""
Run verifiers evaluation with the candidate system prompt.
"""
inputs = _inject_system_prompt(batch, candidate.get("system_prompt", ""))

results = asyncio.get_event_loop().run_until_complete(
self.env.generate(
inputs=inputs,
client=self.client,
model=self.model,
sampling_args=self.sampling_args,
max_concurrent=self.max_concurrent,
use_tqdm=self.use_tqdm,
)
)

n_examples = len(results["reward"])
outputs: list[dict[str, Any]] = []
for i in range(n_examples):
outputs.append({
"prompt": results["prompt"][i],
"completion": results["completion"][i],
"answer": results["answer"][i],
"reward": results["reward"][i],
"example_id": results["example_id"][i],
})

# Update display if configured
if self.display is not None:
prompt_text = candidate.get("system_prompt", "")
if prompt_text not in self._seen_prompts:
self._seen_prompts[prompt_text] = len(self._seen_prompts)
candidate_idx = self._seen_prompts[prompt_text]

self.display.update_eval(
candidate_idx=candidate_idx,
scores=results["reward"],
example_ids=results["example_id"],
capture_traces=capture_traces,
)

return EvaluationBatch(
outputs=outputs,
scores=results["reward"],
trajectories=results["state"] if capture_traces else None,
)

def make_reflective_dataset(
self,
candidate: dict[str, str], # noqa: ARG002 - required by GEPA adapter protocol
eval_batch: EvaluationBatch[State, dict[str, Any]],
components_to_update: list[str],
) -> Mapping[str, Sequence[Mapping[str, Any]]]:
"""Build reflective dataset for GEPA teacher LLM."""
outputs: list[dict[str, Any]] = eval_batch.outputs
states: list[State] = eval_batch.trajectories or []
scores = eval_batch.scores

records = []
# outputs, states, and scores should be the same length
for output, state, score in zip(outputs, states, scores):
> **Review comment (Medium severity): zip with empty trajectories produces no reflective records.** In `make_reflective_dataset`, when `eval_batch.trajectories` is `None` (which occurs when `capture_traces=False` at line 124), `states` becomes an empty list at line 135. The subsequent `zip(outputs, states, scores)` at line 139 then produces zero iterations, since one of the iterables is empty. This causes the function to return empty records even though `outputs` and `scores` contain valid data. The core record fields (`query`, `completion`, `expected_answer`, `reward`) only require `output` and `score`, not `state`, but the zip structure prevents any records from being created.

record: dict[str, Any] = {
"query": _extract_user_query(output["prompt"]),
"completion": messages_to_printable(output["completion"]),
"expected_answer": output["answer"],
"reward": score,
}

if state.get("error"):
record["error"] = repr(state["error"])

if state.get("stop_condition"):
record["stop_condition"] = state["stop_condition"]

for col in self.state_columns:
if col in state:
record[col] = _serialize(state[col])

records.append(record)

return {comp: records for comp in components_to_update}


def _inject_system_prompt(
inputs: list[RolloutInput],
system_prompt: str,
) -> list[RolloutInput]:
"""Inject or replace system prompt in each input's prompt."""
if not system_prompt:
return inputs

modified = []
for inp in inputs:
inp_copy = dict(inp)
prompt = inp_copy.get("prompt", [])

if isinstance(prompt, str):
inp_copy["prompt"] = f"{system_prompt}\n\n{prompt}"
else:
prompt = [dict(m) for m in prompt]
if not prompt:
# Empty prompt list - just add system message
prompt = [{"role": "system", "content": system_prompt}]
elif prompt[0].get("role") == "system":
prompt[0] = {**prompt[0], "content": system_prompt}
else:
prompt = [{"role": "system", "content": system_prompt}] + prompt
inp_copy["prompt"] = prompt

modified.append(inp_copy)
return modified


def _extract_user_query(prompt: Messages) -> str:
"""Extract user query from prompt, skipping system message."""
if isinstance(prompt, str):
return prompt
for msg in prompt:
if msg.get("role") == "user":
content = message_to_printable(msg).get("content", "")
if isinstance(content, str):
return content
return str(content) if content else ""
return ""


def _serialize(value: Any) -> Any:
"""Make value JSON-serializable."""
if hasattr(value, "model_dump"):
return value.model_dump()
if isinstance(value, list):
return [_serialize(v) for v in value]
if isinstance(value, dict):
return {k: _serialize(v) for k, v in value.items()}
if isinstance(value, Exception):
return repr(value)
return value
> **Comment on lines +206 to +216 (Member):** don't we have something more general for `state[col]` serialization?
>
> **Reply (Member):** Not really yet, think that's coming with the env worker PR -- by default we don't enforce that state cols are serializable, and it's up to the user to only select serializable columns (e.g. for `make_dataset`).

42 changes: 42 additions & 0 deletions verifiers/gepa/config.py
@@ -0,0 +1,42 @@
from dataclasses import dataclass, field
from pathlib import Path

from verifiers.types import ClientConfig, SamplingArgs


@dataclass
class GEPAConfig:
"""Configuration for GEPA optimization."""

# Environment
env_id: str
env_args: dict = field(default_factory=dict)

# Models
model: str = "" # Model for rollouts
reflection_model: str | None = None # Model for reflection (defaults to model)
client_config: ClientConfig = field(default_factory=ClientConfig)

# Dataset sizes
num_train_examples: int = 100
num_val_examples: int = 50

# GEPA optimization
max_metric_calls: int = 500
reflection_minibatch_size: int = 3
initial_prompt: str | None = None # None = use env.system_prompt

# Reflective dataset
state_columns: list[str] = field(default_factory=list)

# Execution
sampling_args: SamplingArgs = field(default_factory=dict)
max_concurrent: int = 32

# Output
run_dir: Path | None = None
seed: int = 0
verbose: bool = False

# Saving
save_results: bool = True # Save final results to disk