76 changes: 76 additions & 0 deletions pufferlib/config/ocean/orbital_dock.ini
@@ -0,0 +1,76 @@
[base]
package = ocean
env_name = puffer_orbital_dock
policy_name = OrbitalDock
rnn_name = Recurrent

[policy]
hidden_size = 256

[rnn]
input_size = 256
hidden_size = 256

[vec]
num_envs = 4

[env]
num_envs = 1024
mu = 3.986e14
# GEO orbit: R = 42,164 km
station_radius = 42164000.0
dt = 1.0
max_thrust = 10.0
mass = 500.0
fuel_budget = 100.0
max_steps = 2500
# Docking point ([0, 60, 0] in LVLH)
dock_x = 0.0
dock_y = 60.0
dock_z = 0.0
# Docking thresholds (10m, 2 m/s)
dock_dist = 10.0
dock_speed = 2.0
# Speed annealing: dock_speed_start -> dock_speed over anneal_steps per-env steps
dock_speed_start = 10.0
anneal_steps = 50000
# LOS cone (60 deg total, 800m extent)
los_angle = 60.0
los_extent = 800.0
# Initial conditions
init_x_center = 0.0
init_y_center = 800.0
init_z_center = 0.0
init_x_range = 400.0
init_y_range = 300.0
init_z_range = 400.0

[train]
learning_rate = 0.001
# 4096 total agents × 16 bptt = 65536
batch_size = 65536
minibatch_size = 4096
update_epochs = 2
gamma = 0.998
gae_lambda = 0.98
clip_coef = 0.06
ent_coef = 0.005
vf_coef = 0.5
vf_clip_coef = 0.5
max_grad_norm = 0.5
anneal_lr = true
min_lr_ratio = 0.1
total_timesteps = 500_000_000
checkpoint_interval = 200
bptt_horizon = 16
use_rnn = false

[sweep]
downsample = 0

[sweep.train.total_timesteps]
distribution = log_normal
min = 5e7
max = 2e8
mean = 1e8
scale = time
1 change: 1 addition & 0 deletions pufferlib/ocean/environment.py
@@ -162,6 +162,7 @@ def make_multiagent(buf=None, **kwargs):
'spaces': make_spaces,
'multiagent': make_multiagent,
'slimevolley': 'SlimeVolley',
'orbital_dock': 'OrbitalDock',
}

def env_creator(name='squared', *args, **kwargs):
71 changes: 71 additions & 0 deletions pufferlib/ocean/orbital_dock/README.md
@@ -0,0 +1,71 @@
# Orbital Dock

Spacecraft rendezvous and docking in the LVLH (Hill) reference frame. A chaser spacecraft must navigate 400–1200m to reach a docking port located 60m along the station's V-bar (prograde) axis. The approach is constrained to a 60° line-of-sight cone centered on the docking axis — the chaser cannot dock from arbitrary directions but must align with the V-bar corridor. A hard speed gate (2.0 m/s) enforces a controlled final approach. The physics use Clohessy-Wiltshire linearized relative motion with RK4 integration — the standard model for proximity operations in circular orbit.

PPO achieves a **99.5% dock rate** against a 2.0 m/s speed gate at 500M training steps.

## Physics

- **Dynamics:** Clohessy-Wiltshire (CW) equations, RK4 integration, dt = 1s
- **Orbit:** GEO (orbital radius R = 42,164 km), mean motion n = 7.29e-5 rad/s
- **Thrust:** 10N max per axis, 500 kg chaser, max accel = 0.02 m/s²
- **Fuel:** 100 m/s delta-v budget
- **Frame:** LVLH — R-bar (radial), V-bar (prograde), H-bar (normal)
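The CW dynamics above can be sketched as follows. This is an illustrative Python version (the actual implementation lives in `orbital_dock.h`); function names here are hypothetical, but the equations, constants, and RK4 scheme match the bullets above:

```python
import numpy as np

MU = 3.986e14          # Earth's gravitational parameter (m^3/s^2)
R_STATION = 42_164e3   # GEO orbital radius (m)
N = np.sqrt(MU / R_STATION**3)  # mean motion, ~7.29e-5 rad/s

def cw_deriv(state, accel):
    """Clohessy-Wiltshire relative dynamics in LVLH.
    state = [x, y, z, vx, vy, vz]: x radial (R-bar), y prograde (V-bar),
    z normal (H-bar). accel is thrust acceleration per axis (m/s^2)."""
    x, y, z, vx, vy, vz = state
    ax, ay, az = accel
    return np.array([
        vx, vy, vz,
        3*N**2*x + 2*N*vy + ax,   # radial: coupled to along-track velocity
        -2*N*vx + ay,             # along-track: Coriolis coupling only
        -N**2*z + az,             # cross-track: simple harmonic
    ])

def rk4_step(state, accel, dt=1.0):
    """One RK4 step with piecewise-constant thrust over dt."""
    k1 = cw_deriv(state, accel)
    k2 = cw_deriv(state + 0.5*dt*k1, accel)
    k3 = cw_deriv(state + 0.5*dt*k2, accel)
    k4 = cw_deriv(state + dt*k3, accel)
    return state + (dt/6.0)*(k1 + 2*k2 + 2*k3 + k4)
```

One useful property of CW: any point at rest on the V-bar (y-axis) is an equilibrium, which is why a V-bar docking corridor is the standard choice.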

## Observations (10D)

| Index | Description |
|-------|-------------|
| 0-2 | Position [x, y, z] in LVLH (meters) |
| 3-5 | Velocity [vx, vy, vz] in LVLH (m/s) |
| 6 | Distance to dock point (m) |
| 7 | Speed (m/s) |
| 8 | Closing velocity (m/s, positive = approaching) |
| 9 | Time remaining (fraction) |
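Indices 6-8 are derived from the raw state. A minimal sketch of the derivation, assuming closing velocity is the component of velocity along the bearing to the dock (the exact normalization in the C code may differ):

```python
import numpy as np

def derived_obs(pos, vel, dock=np.array([0.0, 60.0, 0.0])):
    """Distance, speed, and closing velocity toward the dock point.
    Closing velocity is positive when moving toward the dock."""
    rel = dock - pos
    dist = np.linalg.norm(rel)
    speed = np.linalg.norm(vel)
    closing = np.dot(vel, rel) / dist if dist > 0 else 0.0
    return dist, speed, closing
```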

## Actions

Continuous `Box([-1, 1], shape=(3,))` — thrust fraction per LVLH axis (R-bar, V-bar, H-bar).
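The action-to-acceleration mapping implied by the physics parameters (10 N, 500 kg), sketched under the assumption of simple per-axis scaling:

```python
import numpy as np

def thrust_accel(action, max_thrust=10.0, mass=500.0):
    """Map a [-1, 1] action to per-axis acceleration (m/s^2)."""
    a = np.clip(np.asarray(action, dtype=float), -1.0, 1.0)
    return a * max_thrust / mass   # max |accel| = 0.02 m/s^2 per axis
```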

## Termination

- **Dock:** in LOS cone AND dist < 10m AND speed < 2.0 m/s
- **Crash:** y < dock_y - 5m (overshoot past dock point)
- **Timeout:** 2500 steps
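The dock and crash checks can be sketched as below. This assumes the LOS cone axis points along +V-bar (+y) from the dock point, matching the approach corridor described above; the C implementation is authoritative:

```python
import numpy as np

def dock_status(pos, vel, dock=np.array([0.0, 60.0, 0.0]),
                half_angle=np.deg2rad(30.0), dock_dist=10.0, dock_speed=2.0):
    """Illustrative termination test (assumed cone axis: +y from dock)."""
    rel = pos - dock
    dist = np.linalg.norm(rel)
    speed = np.linalg.norm(vel)
    # inside the cone when the bearing is within half_angle of +y
    in_cone = dist > 0 and rel[1] / dist >= np.cos(half_angle)
    if in_cone and dist < dock_dist and speed < dock_speed:
        return "dock"
    if pos[1] < dock[1] - 5.0:   # overshot past the dock point
        return "crash"
    return "flying"
```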

## Reward

Per-step:
- `+0.01 * (prev_dist - dist)` — distance progress
- `+0.1 * (exp(-dist/20) - exp(-prev_dist/20))` — potential-based proximity
- `-0.005 * exp(-dist/30) * speed²` — proximity-weighted braking
- `-0.005 * (act²)` — control cost
- `-0.005` — time penalty

Terminal: dock = +0.5 to +1.0 (soft speed bonus), crash = -1.0, timeout = -0.5
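The per-step terms above combine into a single shaping function. An illustrative sketch (constants mirror the list; the C implementation is authoritative):

```python
import numpy as np

def step_reward(prev_dist, dist, speed, act):
    """Per-step shaping reward as listed above."""
    r  = 0.01 * (prev_dist - dist)                          # distance progress
    r += 0.1  * (np.exp(-dist/20) - np.exp(-prev_dist/20))  # potential-based proximity
    r -= 0.005 * np.exp(-dist/30) * speed**2                # proximity-weighted braking
    r -= 0.005 * np.sum(np.asarray(act)**2)                 # control cost
    r -= 0.005                                              # time penalty
    return r
```

Note that hovering in place (`prev_dist == dist`, zero speed, zero thrust) earns exactly the time penalty and nothing else, which is the anti-farming property discussed below.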

## Reward Shaping

**Exponential proximity reward.** With only linear distance progress (`prev_dist - dist`) over long distances, the agent learns to approach but stalls short of the dock: near the dock, the expected crash penalty outweighs the marginal progress reward, so the agent never commits to the final approach. An exponential proximity bonus (`exp(-dist/20)`) pulls the agent to explore through the final approach.

**Potential-based formulation.** A naive proximity bonus (`+0.1 * exp(-dist/20)`) is farmable — the agent can park at ~20m from the dock and collect +0.02/step indefinitely, earning more per episode than the +1.0 dock terminal reward. The potential-based version (`exp(-dist/20) - exp(-prev_dist/20)`) only rewards *movement toward* the dock. Hovering pays zero.

**Proximity-weighted braking** (`exp(-dist/30) * speed²`) solves the final piece: speed control. Global velocity penalties create a "50m wall" where the agent learns to stop far from the dock to avoid the per-step speed cost. Weighting by proximity focuses the braking signal where it matters — at 800m the penalty is negligible, at 30m it's moderate, at 10m it's strong. The agent cruises at full speed for most of the approach and only brakes in the final stretch.

**Velocity annealing.** Annealing the dock speed gate (10 → 2 m/s over 50K per-env steps) bootstraps the dock reward signal. Without annealing, the agent never experiences a successful dock and can't learn the goal. Even a short anneal exposes the policy to successful trajectories, and the braking reward keeps the gradient informative as the gate tightens.
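The anneal schedule can be sketched as follows; a linear schedule is assumed here, and the function name is hypothetical:

```python
def speed_gate(global_step, start=10.0, final=2.0, anneal_steps=50_000):
    """Dock speed threshold annealed from `start` to `final` over
    `anneal_steps` per-env steps (linear schedule assumed)."""
    frac = min(global_step / anneal_steps, 1.0)
    return start + (final - start) * frac
```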

## Architecture

- Actor: 2-layer MLP (10 → 256 → 256 → 3), GELU, logstd init = -1.0
- Critic: separate 2-layer MLP (10 → 256 → 256 → 1)
- RunningNorm on observations, no RNN
- 664.8K parameters

Training: lr=0.001, clip=0.06, ent=0.005, epochs=2, gamma=0.998, gae=0.98, 1024 envs, batch=65536, minibatch=4096, 500M steps.

## Files

- `orbital_dock.h` — C environment (CW dynamics, reward, physics)
- `orbital_dock.py` — Python PufferEnv wrapper
- `binding.c` — C-Python binding
- `render.h` — Raylib 3D visualization
55 changes: 55 additions & 0 deletions pufferlib/ocean/orbital_dock/binding.c
@@ -0,0 +1,55 @@
#include "orbital_dock.h"
#include "render.h"

#define Env OrbitalDock
#include "../env_binding.h"

static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
env->client = NULL;

// Physics parameters
env->mu = unpack(kwargs, "mu");
env->station_radius = unpack(kwargs, "station_radius");
env->dt = unpack(kwargs, "dt");
env->max_thrust = unpack(kwargs, "max_thrust");
env->mass = unpack(kwargs, "mass");
env->fuel_budget = unpack(kwargs, "fuel_budget");
env->max_steps = (int)unpack(kwargs, "max_steps");

// Docking point ([0, 60, 0])
env->dock_x = unpack(kwargs, "dock_x");
env->dock_y = unpack(kwargs, "dock_y");
env->dock_z = unpack(kwargs, "dock_z");
env->dock_dist = unpack(kwargs, "dock_dist");
env->dock_speed = unpack(kwargs, "dock_speed");
env->dock_speed_start = unpack(kwargs, "dock_speed_start");
env->anneal_steps = (int)unpack(kwargs, "anneal_steps");
env->global_step = 0;

// LOS cone
double los_angle_deg = unpack(kwargs, "los_angle");
env->los_half_angle = (los_angle_deg / 2.0) * M_PI / 180.0;
env->los_extent = unpack(kwargs, "los_extent");

// Initial condition ranges
env->init_x_center = unpack(kwargs, "init_x_center");
env->init_y_center = unpack(kwargs, "init_y_center");
env->init_z_center = unpack(kwargs, "init_z_center");
env->init_x_range = unpack(kwargs, "init_x_range");
env->init_y_range = unpack(kwargs, "init_y_range");
env->init_z_range = unpack(kwargs, "init_z_range");

return 0;
}

static int my_log(PyObject* dict, Log* log) {
assign_to_dict(dict, "episode_return", log->episode_return);
assign_to_dict(dict, "episode_length", log->episode_length);
assign_to_dict(dict, "dock_success", log->dock_success);
assign_to_dict(dict, "crash_rate", log->crash_rate);
assign_to_dict(dict, "timeout_rate", log->timeout_rate);
assign_to_dict(dict, "fuel_used", log->fuel_used);
assign_to_dict(dict, "final_distance", log->final_distance);
assign_to_dict(dict, "final_rel_speed", log->final_rel_speed);
return 0;
}
75 changes: 75 additions & 0 deletions pufferlib/ocean/orbital_dock/orbital_dock.c
@@ -0,0 +1,75 @@
// Standalone C demo for orbital_dock environment
// Compile using: ./scripts/build_ocean.sh orbital_dock [local|fast]
// Run with: ./orbital_dock

#include "orbital_dock.h"
#include "render.h"
#include <time.h>

void generate_random_actions(OrbitalDock *env) {
env->actions[0] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
env->actions[1] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
env->actions[2] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
}

int main() {
srand(time(NULL));

OrbitalDock *env = calloc(1, sizeof(OrbitalDock));

// Physics parameters (GEO orbit)
env->mu = 3.986e14;
env->station_radius = 42164e3;
env->dt = 1.0;
env->max_thrust = 10.0;
env->mass = 500.0;
env->fuel_budget = 100.0;
env->max_steps = 2500;

// Docking point
env->dock_x = 0.0;
env->dock_y = 60.0;
env->dock_z = 0.0;
env->dock_dist = 10.0;
env->dock_speed = 2.0;
env->dock_speed_start = 10.0;
env->anneal_steps = 50000;
env->global_step = 0;

// LOS cone (60 deg total)
env->los_half_angle = 30.0 * M_PI / 180.0;
env->los_extent = 800.0;

// Initial condition ranges
env->init_x_center = 0.0;
env->init_y_center = 800.0;
env->init_z_center = 0.0;
env->init_x_range = 400.0;
env->init_y_range = 300.0;
env->init_z_range = 400.0;

// Allocate buffers
env->observations = (float *)calloc(10, sizeof(float));
env->actions = (float *)calloc(3, sizeof(float));
env->rewards = (float *)calloc(1, sizeof(float));
env->terminals = (unsigned char *)calloc(1, sizeof(unsigned char));
env->client = NULL;

c_reset(env);
c_render(env);

while (!WindowShouldClose()) {
generate_random_actions(env);
c_step(env);
c_render(env);
}

c_close(env);
free(env->observations);
free(env->actions);
free(env->rewards);
free(env->terminals);
free(env);

return 0;
}