Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 24 additions & 12 deletions pufferlib/config/ocean/adaptive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ input_size = 256
hidden_size = 256

[env]
num_agents = 1024
num_agents = 8192
; Options: discrete, continuous
action_type = discrete
; Options: classic, jerk
Expand Down Expand Up @@ -49,10 +49,22 @@ num_maps = 1000
init_steps = 0
; Options: "control_vehicles", "control_agents", "control_tracks_to_predict"
control_mode = "control_vehicles"
;
max_controlled_agents = 16
; Options: "create_all_valid", "create_only_controlled"
init_mode = "create_all_valid"
; NOTE(review): presumably controls whether overflow vehicles beyond max_controlled_agents are created as expert-replay agents — confirm against the drive env
create_expert_overflow = False
; Train with co-players; the co-player policy below must be defined if this is true
co_player_enabled = False
; Train with only one ego agent per world. If true, then num_agents must equal max_controlled_agents * num_ego_agents
one_ego_per_scene = False
; Total number of agents training across all worlds
num_ego_agents = 512
;


[policy.conditioning]
[env.conditioning]
; Options: "none", "reward", "entropy", "discount", "all"
type = "none"
collision_weight_lb = -1.0
Expand All @@ -66,16 +78,19 @@ entropy_weight_ub = 0.001
discount_weight_lb = 0.98
discount_weight_ub = 0.80

[co_player_policy]
[env.co_player_policy]
enabled = True
num_ego_agents = 512
policy_name = Drive
rnn_name = Recurrent
policy_path = "resources/drive/policies/varied_discount.pt"
input_size = 64
hidden_size = 256

[co_player_policy.conditioning]
[env.co_player_policy.rnn]
input_size = 256
hidden_size = 256

[env.co_player_policy.conditioning]
; Options: "none", "reward", "entropy", "discount", "all"
type = "all"
collision_weight_lb = -1.0
Expand All @@ -86,22 +101,19 @@ goal_weight_lb = 0.0
goal_weight_ub = 1.0
entropy_weight_lb = 0.0
entropy_weight_ub = 0.001
discount_weight_lb = 0.98
discount_weight_ub = 0.80
discount_weight_lb = 0.80
discount_weight_ub = 0.98

[co_player_rnn]
input_size = 256
hidden_size = 256

[train]
total_timesteps = 2_000_000_000
total_timesteps = 3_000_000_000
; learning_rate = 0.02
; gamma = 0.985
anneal_lr = True
; Needs to be: num_agents * num_workers * BPTT horizon
batch_size = auto
minibatch_size = 372736
minibatch_multiplier = 512
minibatch_multiplier = 256
max_minibatch_size = 372736
; BPTT horizon (overridden by pufferl.py for adaptive agents to k_scenarios * scenario_length)
bptt_horizon = 32
Expand Down
2 changes: 1 addition & 1 deletion pufferlib/config/ocean/drive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ control_mode = "control_vehicles"
; Options: "create_all_valid", "create_only_controlled"
init_mode = "create_all_valid"

[policy.conditioning]
[env.conditioning]
; Options: "none", "reward", "entropy", "discount", "all"
type = "none"
collision_weight_lb = -1.0
Expand Down
4 changes: 3 additions & 1 deletion pufferlib/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@ def forward_eval(self, observations, state):

# TODO: Don't break compile
if h is not None:
assert h.shape[0] == c.shape[0] == observations.shape[0], "LSTM state must be (h, c)"
assert h.shape[0] == c.shape[0] == observations.shape[0], (
f"LSTM state must be (h, c), h shape {h.shape[0]}, observations shape: {observations.shape[0]}"
)
lstm_state = (h, c)
else:
lstm_state = None
Expand Down
29 changes: 28 additions & 1 deletion pufferlib/ocean/drive/binding.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,15 +167,42 @@ static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
env->ego_agent_ids = NULL;
env->num_ego_agents = 0;
}

// Handle placeholder agents
env->num_place_holders = unpack(kwargs, "num_place_holders");
if (env->num_place_holders > 0) {
double* place_holder_ids_d = unpack_float_array(kwargs, "place_holder_ids", &env->num_place_holders);
if (place_holder_ids_d != NULL) {
env->place_holder_ids = (int*)malloc(env->num_place_holders * sizeof(int));
if (env->place_holder_ids == NULL) {
fprintf(stderr, "Error: Failed to allocate memory for place_holder_ids\n");
free(place_holder_ids_d);
env->num_place_holders = 0;
} else {
for (int i = 0; i < env->num_place_holders; i++) {
env->place_holder_ids[i] = (int)place_holder_ids_d[i];
}
free(place_holder_ids_d);
}
} else {
env->place_holder_ids = NULL;
env->num_place_holders = 0;
}
} else {
env->place_holder_ids = NULL;
env->num_place_holders = 0;
}
} else {
// Non-population play mode - set defaults
env->num_ego_agents = 0;
env->ego_agent_ids = NULL;
env->num_place_holders = 0;
env->place_holder_ids = NULL;
}


env->init_mode = (int)unpack(kwargs, "init_mode");
env->control_mode = (int)unpack(kwargs, "control_mode");
env->create_expert_overflow = (int)unpack(kwargs, "create_expert_overflow");
env->goal_behavior = (int)unpack(kwargs, "goal_behavior");
env->goal_radius = (float)unpack(kwargs, "goal_radius");
int map_id = unpack(kwargs, "map_id");
Expand Down
Loading
Loading