gpu2grid · jaywonchung · May 11, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,5 @@ docs/api/
 
 data
 examples/*/outputs
+
+.claude/
diff --git a/README.md b/README.md
@@ -142,6 +142,8 @@ python examples/offline/run_ofo.py --system ieee13 --mode all
 
 `--system` selects the IEEE test feeder (ieee13, ieee34, or ieee123). `--mode` selects one of `baseline-no-tap`, `baseline-tap-change`, `ofo-no-tap`, `ofo-tap-change`, or `all`. Benchmark selection now lives directly in each `InferenceModelSpec`, and generated artifacts are cached per spec under `data/specs/<spec-hash>/`.
 
+A reinforcement-learning (PPO) controller is available as a self-contained example under `examples/rl_controller/`: see [Reinforcement Learning Controller (PPO)](https://gpu2grid.io/openg2g/examples/rl-controller/) for the build / train / evaluate workflow.
+
 ## Documentation
 
 Full documentation is available at [https://gpu2grid.io/openg2g](https://gpu2grid.io/openg2g), including:

diff --git a/_zensical.toml b/_zensical.toml
@@ -37,6 +37,7 @@ nav = [
   { "Examples" = [
     { "GPU Flexibility for Voltage Regulation" = "examples/gpu-flexibility.md" },
     { "Voltage Regulation Strategies" = "examples/voltage-regulation-strategies.md" },
+    { "Reinforcement Learning Controller (PPO)" = "examples/rl-controller.md" },
     { "Controller Parameter Sensitivity" = "examples/controller-parameter-sensitivity.md" },
     { "Grid Topology and DER Effects" = "examples/grid-topology-effects.md" },
     { "Datacenter Sizing and Hosting Capacity" = "examples/hosting-capacity.md" },

diff --git a/docs/examples/rl-controller.md b/docs/examples/rl-controller.md
@@ -0,0 +1,128 @@
+# Reinforcement Learning Controller (PPO)
+
+## Research Question
+
+Can a reinforcement learning policy learn to regulate distribution-feeder voltage by adjusting LLM batch sizes, and how does it compare to model-based approaches like OFO and rule-based control?
+
+## Overview
+
+The PPO workflow is built on top of [stable-baselines3](https://stable-baselines3.readthedocs.io/). Unlike OFO (which needs voltage sensitivity matrices and per-model logistic fits) and rule-based control (which only knows the worst violation magnitude), the PPO policy:
+
+- Reads a structured observation of the grid + datacenter state at each control tick.
+- Outputs a per-model batch-size action (delta or coupled, depending on the action mode).
+- Is trained against a per-step reward that combines voltage-violation penalty, throughput bonus, latency penalty, and a switching-cost term.
+
+The full RL workflow has three stages:
+
+1. **Build a scenario library**: a pool of pre-screened, randomized PV / TVL / inference-ramp scenarios that the PPO environment will sample from during training. Filtering the library to scenarios where OFO has meaningful headroom keeps the learning signal focused.
+2. **Train a PPO policy**: multi-million-step PPO run on the library scenarios, with checkpointing.
+3. **Evaluate**: replay held-out scenarios with baseline / rule-based / OFO / PPO controllers and compare voltage and throughput metrics.
+
+| Script | Purpose |
+|--------|---------|
+| `examples/rl_controller/build_library.py` | Generate, screen, and filter a per-system scenario library. |
+| `examples/rl_controller/train_ppo.py` | PPO training loop; saves model + VecNormalize stats. |
+| `examples/rl_controller/evaluate.py` | Compare baseline / OFO / rule-based / PPO on held-out test scenarios. |
+
+## Setup
+
+The RL workflow needs the `[opendss,rl]` extras:
+
+```bash
+pip install "openg2g[opendss,rl]"
+```
+
+> **Path convention.** All commands below are run from the **repo root**. Output-directory flags (`--tag`, `--output-dir`) take a **subdirectory name only**, which is joined under `examples/rl_controller/outputs/<system>/`. Input-artifact flags (`--scenario-library`, `--ppo-models`) accept arbitrary path strings, resolved against the current working directory.
+
+## Usage
+
+### IEEE 13: end-to-end example
+
+#### 1. Dataset generation
+
+**1a. Build training library**
+
+```bash
+python examples/rl_controller/build_library.py \
+    --system ieee13 \
+    --n-candidates 500 --seed-start 0 \
+    --tag train_n500
+```
+
+**1b. Build test library** (use a different `--seed-start` so train and test seeds don't overlap)
+
+```bash
+python examples/rl_controller/build_library.py \
+    --system ieee13 \
+    --n-candidates 150 --seed-start 1000 \
+    --tag test_n150
+```
+
+Each call writes a library directory with `metadata.json`, `traces.npz`, per-scenario voltage-envelope plots, and a `candidates.csv` of metrics. Acceptance rates around 50% are typical, so request roughly 2× the library size you want.
+
+#### 2. PPO training
+
+```bash
+python examples/rl_controller/train_ppo.py \
+    --system ieee13 \
+    --total-timesteps 2000000 \
+    --total-duration-s 3600 \
+    --n-steps 3600 \
+    --hidden-dims 128 128 128 \
+    --learning-rate 1e-4 \
+    --ent-coef 0.01 \
+    --action-mode delta \
+    --w-voltage 5000 --w-throughput 0.05 --w-latency 0.01 --w-switch 0.5 \
+    --n-envs 8 --seed 1 \
+    --scenario-library examples/rl_controller/outputs/ieee13/scenario_library/train_n500 \
+    --no-ofo-baseline \
+    --output-dir ppo
+```
+
+Output: `ppo_model.zip` (the trained policy) and `ppo_model_vecnormalize.pkl` (the observation-normalization stats); both files are required at inference. Training also writes intermediate snapshots every 10 rollouts to `outputs/ieee13/ppo/checkpoints/_default/ppo_<N>_steps.zip`, TensorBoard logs in `tb/`, and a training-progress plot. In the snapshot path, the inner directory is the agent's site id (`_default` for single-DC systems) and `<N>` is the total env-transitions count, i.e. `n_steps × n_envs × rollout_index`, so with the defaults you'll see ~7 snapshots over a 2M-step run. See `train_ppo.py --help` for the full flag set; pass `--no-tensorboard` to skip TB logging.
+
+#### 3. Controller evaluation
+
+Compares no-coordination baseline, droop (rule-based) control, OFO control, and the trained PPO on the held-out test library:
+
+```bash
+python examples/rl_controller/evaluate.py \
+    --system ieee13 \
+    --ppo-models examples/rl_controller/outputs/ieee13/ppo/ppo_model.zip \
+    --scenario-library examples/rl_controller/outputs/ieee13/scenario_library/test_n150 \
+    --n-scenarios 50 \
+    --obs-mode full-voltage \
+    --include-rule-based \
+    --use-display-names \
+    --output-dir eval_4ctrl_ieee13 \
+    --log-level INFO
+```
+
+Outputs (under `examples/rl_controller/outputs/ieee13/eval_4ctrl_ieee13/`):
+
+- `results.csv`: per-scenario metrics for every controller: violation time, integral violation, worst Vmin/Vmax, mean throughput, p99 latency, mean power, batch-change count.
+- `aggregate_*.png`: bar charts comparing voltage / throughput / batch-switching across controllers.
+- `scenario_<seed>/`: per-scenario voltage envelopes and batch-size traces.
+
+Multiple PPO checkpoints (e.g., intermediate snapshots from `checkpoints/<dc_id>/ppo_<N>_steps.zip` or runs from a multi-seed sweep) can be passed as space-separated arguments to `--ppo-models` and labelled via `--ppo-labels`.
+
+**Other feeders.** Repeat the same three stages for `ieee34` / `ieee123` with the right flags:
+
+- **`ieee34`**: pass `--no-randomize-ramps` to `build_library.py` (the default per-second ramp randomization tends to swamp the smaller load envelope on this feeder).
+- **`ieee123`**: this is the only feeder with named zones, so it's the only one where `--obs-mode per-zone-summary` is valid for `train_ppo.py` / `evaluate.py`. Passing it on `ieee13` / `ieee34` raises a `ValueError`.
+
+## What to Look For
+
+- **Voltage**: a well-trained PPO matches or slightly trails OFO on integral violation (pu·s).
+- **Throughput**: PPO usually serves *more* tokens per second than OFO because it's free to choose any feasible batch level rather than following gradient descent toward a fixed setpoint.
+- **Switching**: untrained or under-trained policies oscillate a lot (high `batch_chg`); the `--w-switch` term penalizes this. Compare PPO's `Batch Δ` column against OFO's to see whether the policy has learned a smooth control trajectory.
+- **Latency**: PPO can violate ITL deadlines if `--w-latency` is too small; increase it if your application is latency-sensitive.
+
+## Configuration
+
+Most knobs are CLI flags on `train_ppo.py` (network architecture, optimizer, reward weights, scenario randomization). For deeper changes:
+
+- **Observation features and reward shape**: edit `ObservationConfig` / `RewardConfig` / `build_observation` / `compute_reward` in `examples/rl_controller/env.py`.
+- **Per-system experiments** (which models are served, replica counts and ramps, base loads, training overlay): edit the `ieee13_experiment` / `ieee34_experiment` / `ieee123_experiment` factories registered under `EXPERIMENTS` in `examples/rl_controller/scenarios.py`.
+
+See [Voltage Regulation Strategies](voltage-regulation-strategies.md) for a side-by-side comparison of PPO with the model-based controllers, and [Building Simulators](../guide/building-simulators.md) for the underlying API.
diff --git a/docs/examples/voltage-regulation-strategies.md b/docs/examples/voltage-regulation-strategies.md
@@ -8,11 +8,12 @@ How does datacenter-side batch-size control compare with grid-side regulator tap
 
 Distribution feeders traditionally regulate voltage using regulator tap changers: mechanical devices that adjust transformer turns ratios. Datacenter batch-size control offers a complementary demand-side approach: adjusting GPU workload parameters to modulate power consumption in real time.
 
-This analysis compares three control strategies:
+This analysis compares four control strategies:
 
 1. **Baseline with tap changes**: Traditional grid-side control only (regulator tap schedule, no batch adjustment)
 2. **Rule-based batch control**: Simple proportional controller that reduces batch on undervoltage and increases on overvoltage; no sensitivity matrix or model fits required
 3. **OFO batch control**: Primal-dual optimization using voltage sensitivity matrices and logistic curve fits for gradient-based batch adjustment
+4. **PPO batch control**: A reinforcement-learning policy (Proximal Policy Optimization) that maps a structured observation of grid + datacenter state to per-model batch-size actions. Requires a separate training run; see [Reinforcement Learning Controller (PPO)](rl-controller.md) for the end-to-end workflow.
 
 ## Scripts
 
@@ -23,6 +24,7 @@ This analysis compares three control strategies:
 | `run_ofo.py --mode ofo-no-tap` | OFO without tap schedule |
 | `run_ofo.py --mode ofo-tap-change` | OFO with tap schedule |
 | `analyze_different_controllers.py` | Side-by-side comparison of baseline, rule-based, and OFO |
+| `examples/rl_controller/evaluate.py` | Held-out scenario evaluation that also accepts trained PPO models via `--ppo-models` (see [Reinforcement Learning Controller (PPO)](rl-controller.md)) |
 
 ## Usage
 

diff --git a/docs/guide/building-simulators.md b/docs/guide/building-simulators.md
@@ -470,7 +470,7 @@ The following sections describe how the built-in components implement the interf
 
 ### `OpenDSSGrid`
 
-[`OpenDSSGrid`][openg2g.grid.opendss.OpenDSSGrid] implements [`GridBackend`][openg2g.grid.base.GridBackend] by wrapping a compiled OpenDSS circuit and running one power-flow solve per step. `start()` compiles the user's DSS file, builds the bus-phase voltage index, and adds native `Storage` elements for any attached storage (storage requires a three-phase bus). Each step writes datacenter, generator, external-load, and storage setpoints into OpenDSS, solves, and returns per-bus, per-phase voltages plus tap positions; each attached storage then receives a [`StorageState`][openg2g.grid.storage.StorageState] readback through `update_state()`. The grid accepts [`SetTaps`][openg2g.grid.command.SetTaps] and [`SetStoragePower`][openg2g.grid.command.SetStoragePower], and exposes `voltages_vector()` and a finite-difference `estimate_sensitivity()` (dV/dP) for controllers that need fine-grained voltage information — used by [`OFOBatchSizeController`](#ofobatchsizecontroller).
+[`OpenDSSGrid`][openg2g.grid.opendss.OpenDSSGrid] implements [`GridBackend`][openg2g.grid.base.GridBackend] by wrapping a compiled OpenDSS circuit and running one power-flow solve per step. `start()` compiles the user's DSS file, builds the bus-phase voltage index, and adds native `Storage` elements for any attached storage (storage requires a three-phase bus). Each step writes datacenter, generator, external-load, and storage setpoints into OpenDSS, solves, and returns per-bus, per-phase voltages plus tap positions; each attached storage then receives a [`StorageState`][openg2g.grid.storage.StorageState] readback through `update_state()`. The grid accepts [`SetTaps`][openg2g.grid.command.SetTaps] and [`SetStoragePower`][openg2g.grid.command.SetStoragePower], and exposes `voltages_vector()` and a finite-difference `estimate_sensitivity()` (dV/dP) for controllers that need fine-grained voltage information, such as [`OFOBatchSizeController`](#ofobatchsizecontroller).
 
 ### `BatteryStorage`
 
@@ -482,11 +482,11 @@ The following sections describe how the built-in components implement the interf
 
 ### `LoadShiftController`
 
-[`LoadShiftController`][openg2g.controller.load_shift.LoadShiftController] implements [`Controller`][openg2g.controller.base.Controller] for cross-site coordination. It holds references to all datacenters and, in a single step, emits paired [`ShiftReplicas`][openg2g.datacenter.command.ShiftReplicas] commands targeting different DCs (`-` at the source, `+` at the destination) — illustrating a controller that spans multiple datacenters and produces a coordinated multi-target action. It also has an ordering relationship with other controllers: its activation rule depends on whether per-site batch controllers are saturated, so it must run after them.
+[`LoadShiftController`][openg2g.controller.load_shift.LoadShiftController] implements [`Controller`][openg2g.controller.base.Controller] for cross-site coordination. It holds references to all datacenters and, in a single step, emits paired [`ShiftReplicas`][openg2g.datacenter.command.ShiftReplicas] commands targeting different DCs (`-` at the source, `+` at the destination), illustrating a controller that spans multiple datacenters and produces a coordinated multi-target action. It also has an ordering relationship with other controllers: its activation rule depends on whether per-site batch controllers are saturated, so it must run after them.
 
 ### `LocalVoltageStorageDroopController`
 
-[`LocalVoltageStorageDroopController`][openg2g.controller.storage.LocalVoltageStorageDroopController] implements [`Controller`][openg2g.controller.base.Controller] for storage rather than datacenters. It illustrates two patterns the other built-in controllers don't: targeting an [`EnergyStorage`][openg2g.grid.storage.EnergyStorage] (via [`SetStoragePower`][openg2g.grid.command.SetStoragePower]), and reading windowed grid *history* rather than just the current state — each step it consumes the history accumulated since the previous tick, reduces each storage's local samples by the configured `voltage_statistic`, and runs a deadbanded droop curve clipped to the storage rating. In Q-V mode the output drives kvar; in P-V mode it drives kW.
+[`LocalVoltageStorageDroopController`][openg2g.controller.storage.LocalVoltageStorageDroopController] implements [`Controller`][openg2g.controller.base.Controller] for storage rather than datacenters. It illustrates two patterns the other built-in controllers don't: targeting an [`EnergyStorage`][openg2g.grid.storage.EnergyStorage] (via [`SetStoragePower`][openg2g.grid.command.SetStoragePower]), and reading windowed grid *history* rather than just the current state: each step it consumes the history accumulated since the previous tick, reduces each storage's local samples by the configured `voltage_statistic`, and runs a deadbanded droop curve clipped to the storage rating. In Q-V mode the output drives kvar; in P-V mode it drives kW.
 
 ## Example Analysis Scripts
 

diff --git a/examples/model_insights/common.py b/examples/model_insights/common.py
@@ -13,7 +13,6 @@
 import csv
 import logging
 import math
-import sys
 from dataclasses import dataclass, field
 from fractions import Fraction
 from pathlib import Path
@@ -44,7 +43,6 @@
 from openg2g.metrics.performance import PerformanceStats, compute_performance_stats
 from openg2g.metrics.voltage import VoltageStats, compute_allbus_voltage_stats
 
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "offline"))
 from systems import SYSTEMS, tap
 
 logger = logging.getLogger("model_insights")
@@ -314,7 +312,7 @@ def _spec(
 }
 
 
-# Data pipeline — per-spec content-addressed cache (see InferenceModelSpec.cache_hash)
+# Data pipeline: per-spec content-addressed cache (see InferenceModelSpec.cache_hash)
 
 SPECS_CACHE_DIR = _PROJECT_ROOT / "data" / "specs"
 TRAINING_TRACE_PATH = _PROJECT_ROOT / "data" / "training_trace.csv"
@@ -558,7 +556,7 @@ def compute_achievable_power_range(
     DC power.
 
     The logistic is fit to `avg_power_watts`, the per-run average power for
-    the full `num_gpus` bench configuration — `model.eval(batch)` already
+    the full `num_gpus` bench configuration: `model.eval(batch)` already
     covers the whole replica, so do NOT multiply by `gpus_per_replica`.
     """
     total_max_w = 0.0
@@ -589,7 +587,7 @@ def compute_matched_peak_replicas(
     budget when GPU counts per replica differ.
 
     Args:
-        spec: The model spec — `feasible_batch_sizes` must already be capped
+        spec: The model spec. `feasible_batch_sizes` must already be capped
             to batches that meet the SLO; pass the output of
             `restrict_spec_by_deadline` if an SLO is in play.
         target_peak_kw: Target peak inference power in kW.
@@ -634,7 +632,7 @@ def deploy(
     """Shorthand for `(ModelDeployment, ReplicaSchedule(initial=...))`.
 
     `initial_batch_size=None` (the default) starts the scenario at the
-    largest feasible batch — i.e., maximum DC power stress, leaving the
+    largest feasible batch, i.e., maximum DC power stress, leaving the
     controller full downward range.
 
     Args:
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,3 +13,5 @@ docs/api/

		data
		examples/*/outputs

		.claude/