Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
42b8955
Changed counts to max supported by sim, max_controlled, num_created a…
mpragnay Feb 10, 2026
c49b15e
Spawning logic with collision and offroad free spawns, uses 2.0 goal …
mpragnay Feb 11, 2026
8c9376c
Fix mem leaks
mpragnay Feb 11, 2026
87bd891
Apply suggestion from @Copilot
mpragnay Feb 12, 2026
5466e9e
Fixed Agent Counts to align with gigaflow feature set (#288)
mpragnay Feb 12, 2026
7b376fd
Bug fixing demo and renderer, fixed mem leaks in bindings code, chang…
mpragnay Feb 12, 2026
bc5b942
Merge branch '3.0_beta' into pragnay/randomagents
mpragnay Feb 12, 2026
5cbf4c4
Code cleanup, error handling
mpragnay Feb 12, 2026
7237135
merged changes from 3.0_beta
mpragnay Feb 16, 2026
d62e586
Minor fixes
mpragnay Feb 16, 2026
932d727
minor bugs
mpragnay Feb 16, 2026
ef50fd4
Pre-Compute lanes for spawning
mpragnay Feb 17, 2026
2ade546
Previously Working Settings
mpragnay Feb 17, 2026
257d721
Fixed reset to use the same goal
mpragnay Feb 18, 2026
ff6a94f
Separate out goal resets for bug avoidance with other modes
mpragnay Feb 18, 2026
7ceeeb1
Fixed agent collisions for variable dimensions
mpragnay Feb 18, 2026
0aa0a87
Fixed agent collisions for variable dimensions
mpragnay Feb 18, 2026
0c8d21b
working jerk configs
mpragnay Feb 19, 2026
b6c64c9
Merge branch '3.0_beta' into pragnay/randomagents
mpragnay Feb 19, 2026
dc972e5
pre-commit fixes
mpragnay Feb 19, 2026
974b93e
Relative Speed Observation fix
mpragnay Feb 22, 2026
2f5df66
Visualizer Ego POV road lanes added
mpragnay Feb 22, 2026
e94420b
Merge branch '3.0_beta' into pragnay/randomagents
mpragnay Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 38 additions & 20 deletions pufferlib/config/ocean/drive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rnn_name = Recurrent
[vec]
num_workers = 16
num_envs = 16
batch_size = 4
batch_size = 2
; backend = Serial

[policy]
Expand All @@ -23,11 +23,11 @@ num_agents = 1024
; Options: discrete, continuous
action_type = discrete
; Options: classic, jerk
dynamics_model = classic
dynamics_model = jerk
reward_vehicle_collision = -0.5
reward_offroad_collision = -0.5 # Use -0.05 for carla maps
reward_lane_align = 1
reward_lane_center = 1
reward_lane_align = 0
reward_lane_center = 0
dt = 0.1
reward_goal = 0.4
reward_goal_post_respawn = 0.25
Expand All @@ -38,6 +38,7 @@ min_goal_speed = -0.01
max_goal_speed = 10.0
; What to do when the goal is reached. Options: 0:"respawn", 1:"generate_new_goals", 2:"stop"
goal_behavior = 1
; Determines the target distance to the new goal in the case of goal_behavior = generate_new_goals.
; Large numbers will select a goal point further away from the agent's current position.
min_goal_distance = 0.5
Expand All @@ -48,21 +49,31 @@ collision_behavior = 0
offroad_behavior = 0
; Number of steps before the episode ends
episode_length = 300
resample_frequency = 300
resample_frequency = 10000
termination_mode = 1 # 0 - terminate at episode_length, 1 - terminate after all agents have been reset
map_dir = "resources/drive/binaries/carla_3D"
num_maps = 10000
map_dir = "resources/drive/binaries/carla_data"
num_maps = 8
; If True, allows training with fewer maps than requested (warns instead of erroring)
allow_fewer_maps = True
; Determines which step of the trajectory to initialize the agents at upon reset
init_steps = 0
; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only"
control_mode = "control_vehicles"
; Options: "created_all_valid", "create_only_controlled"
init_mode = "create_all_valid"
reward_randomization = 1
; Options: "create_all_valid", "create_only_controlled", "random_agents" (creates a random number of controlled agents per env)
init_mode = "random_agents"
; Below options only valid for "random_agents" init_mode
min_agents_per_env = 16
max_agents_per_env = 32
spawn_width_min = 1.5
spawn_width_max = 2.5
spawn_length_min = 2.0
spawn_length_max = 5.5
spawn_height = 1.5

; Reward settings
reward_randomization = 0
; Options: 0 - Fixed reward values, 1 - Random reward values
reward_conditioning = 1
reward_conditioning = 0
; Options: 1 - Add reward coefs to obs array, 0 - Dont

# Reward randomization bounds (min, max)
Expand Down Expand Up @@ -116,10 +127,10 @@ reward_bound_acc_max = 1.5

[train]
seed=42
total_timesteps = 2_000_000_000
total_timesteps = 1_000_000_000
; learning_rate = 0.02
; gamma = 0.985
anneal_lr = True
anneal_lr = False
; Needs to be: num_agents * num_workers * BPTT horizon
batch_size = 524288
minibatch_size = 32768
Expand All @@ -129,6 +140,7 @@ adam_beta1 = 0.9
adam_beta2 = 0.999
adam_eps = 1e-8
clip_coef = 0.2
; Ent coef needs to be tuned for RANDOM_AGENTS mode (found 0.02 to work best)
ent_coef = 0.005
gae_lambda = 0.95
gamma = 0.98
Expand All @@ -144,7 +156,7 @@ vtrace_rho_clip = 1
checkpoint_interval = 250
; Rendering options
render = True
render_async = False # Render interval of below 50 might cause process starvation and slowness in training
render_async = True # Render interval of below 50 might cause process starvation and slowness in training
render_interval = 250
; If True, show exactly what the agent sees in agent observation
obs_only = True
Expand All @@ -157,7 +169,7 @@ show_human_logs = False
; If True, zoom in on a part of the map. Otherwise, show full map
zoom_in = True
; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None
render_map = none
render_map = None

[eval]
eval_interval = 1000
Expand Down Expand Up @@ -242,11 +254,17 @@ mean = 0.98
max = 0.999
scale = auto

[controlled_exp.train.goal_speed]
values = [10, 20, 30, 3]
; [controlled_exp.train.goal_speed]
; values = [10, 20, 30, 3]

; [controlled_exp.train.ent_coef]
; values = [0.001, 0.005, 0.01]

; [controlled_exp.train.seed]
; values = [42, 55, 1]

[controlled_exp.train.ent_coef]
values = [0.001, 0.005, 0.01]
values = [0.025, 0.015]

[controlled_exp.train.seed]
values = [42, 55, 1]
[controlled_exp.env.goal_target_distance]
values = [7.0, 10.0]
59 changes: 58 additions & 1 deletion pufferlib/ocean/drive/binding.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,16 +125,60 @@ static PyObject *my_shared(PyObject *self, PyObject *args, PyObject *kwargs) {
float reward_bound_acc_max = unpack(kwargs, "reward_bound_acc_max");

int use_all_maps = unpack(kwargs, "use_all_maps");
int min_agents_per_env = unpack(kwargs, "min_agents_per_env");
int max_agents_per_env = unpack(kwargs, "max_agents_per_env");

clock_gettime(CLOCK_REALTIME, &ts);
srand(ts.tv_nsec);

int max_envs = use_all_maps ? num_maps : num_agents;

if (init_mode == RANDOM_AGENTS) {
// Training mode: random agent counts per env
int agent_counts[max_envs];
int remaining = num_agents;
int env_count = 0;

while (remaining > 0) {
int count;
if (remaining <= max_agents_per_env) {
count = remaining;
} else {
// Ensure last env can still meet min_agents_per_env requirement
int upper = (remaining - max_agents_per_env < min_agents_per_env) ? remaining - min_agents_per_env
: max_agents_per_env;
count = min_agents_per_env + rand() % (upper - min_agents_per_env + 1);
}
agent_counts[env_count++] = count;
remaining -= count;
}

PyObject *agent_offsets = PyList_New(env_count + 1);
PyObject *map_ids_list = PyList_New(env_count);

int offset = 0;
for (int i = 0; i < env_count; i++) {
PyList_SetItem(agent_offsets, i, PyLong_FromLong(offset));
PyList_SetItem(map_ids_list, i, PyLong_FromLong(rand() % num_maps));
offset += agent_counts[i];
}
PyList_SetItem(agent_offsets, env_count,
PyLong_FromLong(num_agents)); // In random mode, we guarantee num_agents across all envs
PyObject *tuple = PyTuple_New(3);
PyTuple_SetItem(tuple, 0, agent_offsets);
PyTuple_SetItem(tuple, 1, map_ids_list);
PyTuple_SetItem(tuple, 2, PyLong_FromLong(env_count));
return tuple;
}

// For all other modes
int total_agent_count = 0;
int env_count = 0;
int max_envs = use_all_maps ? num_maps : num_agents;
int map_idx = 0;
int maps_checked = 0;
PyObject *agent_offsets = PyList_New(max_envs + 1);
PyObject *map_ids = PyList_New(max_envs);

// getting env count
while (use_all_maps ? map_idx < max_envs : total_agent_count < num_agents && env_count < max_envs) {
int map_id = use_all_maps ? map_idx++ : rand() % num_maps;
Expand Down Expand Up @@ -340,8 +384,21 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) {
char *map_path = unpack_str(kwargs, "map_path");
int max_agents = unpack(kwargs, "max_agents");
int init_steps = unpack(kwargs, "init_steps");
int max_agents_per_env = unpack(kwargs, "max_agents_per_env");

AgentSpawnSettings spawn_settings = {
.min_w = unpack(kwargs, "spawn_width_min"),
.max_w = unpack(kwargs, "spawn_width_max"),
.min_l = unpack(kwargs, "spawn_length_min"),
.max_l = unpack(kwargs, "spawn_length_max"),
.h = unpack(kwargs, "spawn_height"),
};
env->spawn_settings = spawn_settings;

env->num_agents = max_agents;
if (env->init_mode == RANDOM_AGENTS) {
env->spawn_settings.max_agents_in_sim = max_agents_per_env; // Random Agents only supports controlled agents
}
env->map_name = map_path;
env->init_steps = init_steps;
env->timestep = init_steps;
Expand Down
7 changes: 7 additions & 0 deletions pufferlib/ocean/drive/datatypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,13 @@ void free_agent(struct Agent *agent) {
free(agent->path);
}

/* Release an array of agents: free each agent's owned resources via
 * free_agent(), then release the array allocation itself.
 * Ownership of `agents` transfers to this function; the pointer is
 * invalid after the call. */
void free_agents(struct Agent *agents, int num_agents) {
    for (int idx = 0; idx < num_agents; idx++) {
        free_agent(agents + idx);
    }
    free(agents);
}

void free_road_element(struct RoadMapElement *element) {
free(element->x);
free(element->y);
Expand Down
24 changes: 21 additions & 3 deletions pufferlib/ocean/drive/drive.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ void demo() {
exit(1);
}

// Set different seed each time
srand(time(NULL));

// Note: Use below hardcoded settings for 2.0 demo purposes. Since the policy was
// trained with these exact settings, changing them may lead to
// weird behavior.
Expand All @@ -68,6 +71,15 @@ void demo() {
// .map_name = "resources/drive/map_town_02_carla.bin",
// };

AgentSpawnSettings spawn_settings = {
.max_agents_in_sim = conf.max_agents_per_env,
.min_w = conf.spawn_width_min,
.max_w = conf.spawn_width_max,
.min_l = conf.spawn_length_min,
.max_l = conf.spawn_length_max,
.h = conf.spawn_height,
};

Drive env = {
.human_agent_idx = 0,
.action_type = 0, // Demo doesn't support continuous action space
Expand All @@ -90,13 +102,19 @@ void demo() {
.init_steps = conf.init_steps,
.init_mode = conf.init_mode,
.control_mode = conf.control_mode,
.map_name = "resources/drive/binaries/carla/carla_3D/map_001.bin",
.reward_conditioning = 1,
.spawn_settings = spawn_settings,
.map_name = "resources/drive/binaries/carla/carla_3D/map_000.bin",
.reward_conditioning = conf.reward_conditioning,
};

if (conf.init_mode == RANDOM_AGENTS) {
env.num_agents = conf.min_agents_per_env + rand() % (conf.max_agents_per_env - conf.min_agents_per_env + 1);
}

allocate(&env);
c_reset(&env);
c_render(&env);
Weights *weights = load_weights("resources/drive/puffer_drive_resampling_speed_lane.bin");
Weights *weights = load_weights("resources/drive/puffer_drive_zh9lo9pr.bin");
DriveNet *net = init_drivenet(weights, env.active_agent_count, env.dynamics_model, env.reward_conditioning);

int accel_delta = 1;
Expand Down
Loading
Loading