Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pufferlib/ocean/drive/binding.c
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,10 @@ static int my_log(PyObject *dict, Log *log) {
assign_to_dict(dict, "goals_sampled_this_episode", log->goals_sampled_this_episode);
assign_to_dict(dict, "goals_reached_this_episode", log->goals_reached_this_episode);
assign_to_dict(dict, "speed_at_goal", log->speed_at_goal);
assign_to_dict(dict, "lane_center_rate", log->lane_center_rate);
assign_to_dict(dict, "comfort_violation_count", log->comfort_violation_count);
assign_to_dict(dict, "velocity_progress_sum", log->velocity_progress_sum);
assign_to_dict(dict, "avg_speed_per_agent", log->avg_speed_per_agent);
// assign_to_dict(dict, "avg_displacement_error", log->avg_displacement_error);
return 0;
}
191 changes: 151 additions & 40 deletions pufferlib/ocean/drive/drive.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,10 @@
// Offsets
#define COLLISION_RANGE 5
#define Z_RANGE 3
#define Z_BUFFER 4.0f // 4.0m buffer for different z level checking
#define Z_BUFFER 4.0f // 4.0m buffer for different z level checking
#define SPEED_LIMIT 20.0f // Hardcoded speed limit value
#define COMFORT_JERK_THRESHOLD 5.0f // For JERK model comfort
#define COMFORT_ACCEL_THRESHOLD 3.0f // For JERK and CLASSIC model comfort

// Jerk action space (for JERK dynamics model)
static const float JERK_LONG[4] = {-15.0f, -4.0f, 0.0f, 4.0f};
Expand Down Expand Up @@ -203,16 +206,24 @@ struct Log {
float goals_sampled_this_episode;
float offroad_rate;
float collision_rate;
float red_light_violation_rate; // placeholder for later
float num_goals_reached;
float comfort_violation_count;
float velocity_progress_sum;
float lane_center_rate;
float lane_heading_aligned_rate;
float completion_rate;
float offroad_per_agent;
float collisions_per_agent;
float dnf_rate;
float avg_displacement_error;
float n;
float lane_alignment_rate;
float speed_at_goal;
float active_agent_count;
float expert_static_agent_count;
float static_agent_count;
float avg_speed_per_agent;
};

typedef struct GridMapEntity GridMapEntity;
Expand Down Expand Up @@ -1277,6 +1288,7 @@ bool check_line_intersection(float p1[2], float p2[2], float q1[2], float q2[2])
}

void add_log(Drive *env) {
int safe_timestep = (env->timestep > 0) ? env->timestep : 1;
for (int i = 0; i < env->active_agent_count; i++) {
int agent_idx = env->active_agent_indices[i];
Agent *agent = &env->agents[agent_idx];
Expand All @@ -1292,7 +1304,8 @@ void add_log(Drive *env) {
env->log.offroad_per_agent += offroad_per_agent;
float collisions_per_agent = env->logs[i].collisions_per_agent;
env->log.collisions_per_agent += collisions_per_agent;

float avg_speed_per_agent = env->logs[i].avg_speed_per_agent;
env->log.avg_speed_per_agent += avg_speed_per_agent / safe_timestep;
float frac_goal_reached = agent->goals_reached_this_episode / agent->goals_sampled_this_episode;

// Update score, which is an aggregate measure whether the agent fully solved its task
Expand All @@ -1319,10 +1332,13 @@ void add_log(Drive *env) {
env->log.speed_at_goal += env->logs[i].speed_at_goal;
env->log.episode_length += env->logs[i].episode_length;
env->log.episode_return += env->logs[i].episode_return;
env->log.comfort_violation_count += env->logs[i].comfort_violation_count / safe_timestep;
env->log.velocity_progress_sum += env->logs[i].velocity_progress_sum / safe_timestep;
// Log composition counts per agent so vec_log averaging recovers the per-env value
env->log.active_agent_count += env->active_agent_count;
env->log.expert_static_agent_count += env->expert_static_agent_count;
env->log.static_agent_count += env->static_agent_count;
env->log.lane_center_rate += env->logs[i].lane_center_rate / safe_timestep;
env->log.n += 1;
}
}
Expand Down Expand Up @@ -1910,6 +1926,33 @@ void compute_agent_metrics(Drive *env, int agent_idx) {
int lane_aligned = (fabs(agent->metrics_array[LANE_ANGLE_IDX]) > 0.965) ? 1 : 0;
agent->metrics_array[LANE_ALIGNED_IDX] = lane_aligned;
}
float speed_magnitude = sqrtf(agent->sim_vx * agent->sim_vx + agent->sim_vy * agent->sim_vy);
// Velocity metric (GIGAFLOW) - forward progress aligned with lane
const float VELOCITY_MIN_SPEED = 2.5f; // m/s
if (speed_magnitude > VELOCITY_MIN_SPEED && best_candidate_entity_idx != -1) {
float cos_theta = agent->metrics_array[LANE_ANGLE_IDX];
agent->metrics_array[VELOCITY_PROGRESS_IDX] = fmaxf(cos_theta, 0.0f);
} else {
agent->metrics_array[VELOCITY_PROGRESS_IDX] = 0.0f;
}

// Comfort metric (GIGAFLOW) - only for JERK dynamics, modified for CLASSIC (Custom)
int accel_violation =
(fabsf(agent->a_long) > COMFORT_ACCEL_THRESHOLD) + (fabsf(agent->a_lat) > COMFORT_ACCEL_THRESHOLD);

if (env->dynamics_model == JERK) {

int jerk_violation =
(fabsf(agent->jerk_long) > COMFORT_JERK_THRESHOLD || fabsf(agent->jerk_lat) > COMFORT_JERK_THRESHOLD) ? 1
: 0;
agent->metrics_array[COMFORT_VIOLATION_IDX] = (float)(accel_violation + jerk_violation);
} else {
agent->metrics_array[COMFORT_VIOLATION_IDX] = (float)(accel_violation);
}

// Overspeed penalization - Hardcoded target speed value of 20m/s right now - WILL HAVE TO BE UPDATED ONCE NEW DATA
// FORMAT COMES IN
agent->metrics_array[SPEED_LIMIT_IDX] = (speed_magnitude > SPEED_LIMIT + 2.0f) ? 1.0f : 0.0f;

// Check for vehicle collisions
int car_collided_with_index = collision_check(env, agent_idx);
Expand Down Expand Up @@ -1966,10 +2009,6 @@ void compute_observations(Drive *env) {
float v_dot_heading = ego_entity->sim_vx * cos_heading + ego_entity->sim_vy * sin_heading;
float signed_speed = copysignf(speed_magnitude, v_dot_heading);

// Adding speed limit calculation
float speed_limit = 20.0f;
// We need to add speed limit calculation

// Adding lane angle and center information
float lane_center_dist = ego_entity->metrics_array[LANE_DIST_IDX] / LANE_DISTANCE_NORMALIZATION;
lane_center_dist = fmaxf(-1.0f, fminf(1.0f, lane_center_dist));
Expand Down Expand Up @@ -2002,14 +2041,14 @@ void compute_observations(Drive *env) {
obs[10] = (ego_entity->respawn_timestep != -1) ? 1 : 0;
obs[11] = normalized_goal_speed_min;
obs[12] = normalized_goal_speed_max;
obs[13] = fminf(speed_limit / MAX_SPEED, 1.0f);
obs[13] = fminf(SPEED_LIMIT / MAX_SPEED, 1.0f);
obs[14] = lane_center_dist;
obs[15] = ego_entity->metrics_array[LANE_ANGLE_IDX];
} else {
obs[7] = (ego_entity->respawn_timestep != -1) ? 1 : 0;
obs[8] = normalized_goal_speed_min;
obs[9] = normalized_goal_speed_max;
obs[10] = fminf(speed_limit / MAX_SPEED, 1.0f);
obs[10] = fminf(SPEED_LIMIT / MAX_SPEED, 1.0f);
obs[11] = lane_center_dist;
obs[12] = ego_entity->metrics_array[LANE_ANGLE_IDX];
}
Expand Down Expand Up @@ -2506,24 +2545,40 @@ void c_step(Drive *env) {
}
}

// Compute rewards
// COMPUTE REWARDS
for (int i = 0; i < env->active_agent_count; i++) {
int agent_idx = env->active_agent_indices[i];
Agent *agent = &env->agents[agent_idx];
env->agents[agent_idx].collision_state = 0;
env->agents[agent_idx].aabb_collision_state = 0;
compute_agent_metrics(env, agent_idx);
int collision_state = env->agents[agent_idx].collision_state;
float cos_heading = cosf(agent->sim_heading);
float sin_heading = sinf(agent->sim_heading);
float speed_magnitude = sqrtf(agent->sim_vx * agent->sim_vx + agent->sim_vy * agent->sim_vy);
float v_dot_heading = agent->sim_vx * cos_heading + agent->sim_vy * sin_heading;
float signed_speed = copysignf(speed_magnitude, v_dot_heading);

if (collision_state > 0) {
if (collision_state == VEHICLE_COLLISION) {
env->rewards[i] += env->reward_vehicle_collision;
env->logs[i].episode_return += env->reward_vehicle_collision;
float reward_collision;
if (env->reward_conditioning) {
reward_collision = agent->reward_coefs[REWARD_COEF_COLLISION] - 0.1f * speed_magnitude;
} else {
reward_collision = env->reward_vehicle_collision;
}
env->rewards[i] += reward_collision;
env->logs[i].episode_return += reward_collision;
env->logs[i].collision_rate = 1.0f;
env->logs[i].collisions_per_agent += 1.0f;
} else if (collision_state == OFFROAD) {
env->rewards[i] += env->reward_offroad_collision;
env->logs[i].episode_return += env->reward_offroad_collision;
if (env->reward_conditioning) {
env->rewards[i] += agent->reward_coefs[REWARD_COEF_OFFROAD];
env->logs[i].episode_return += agent->reward_coefs[REWARD_COEF_OFFROAD];
} else {
env->rewards[i] += env->reward_offroad_collision;
env->logs[i].episode_return = env->reward_offroad_collision;
}
env->logs[i].offroad_rate = 1.0f;
env->logs[i].offroad_per_agent += 1.0f;
}
Expand Down Expand Up @@ -2567,43 +2622,99 @@ void c_step(Drive *env) {
env->agents[agent_idx].metrics_array[REACHED_GOAL_IDX] = 1.0f;
env->logs[i].speed_at_goal = current_speed;
}

// Add values to logs first

// Lane alignment
int lane_aligned = env->agents[agent_idx].metrics_array[LANE_ALIGNED_IDX];
env->logs[i].lane_alignment_rate = lane_aligned;

// Add lane rewards
// Get lane angle metric: cos(θ_f) where θ_f = heading diff from lane
float cos_theta = agent->metrics_array[LANE_ANGLE_IDX];
float theta_f = acosf(fminf(fmaxf(cos_theta, -1.0f), 1.0f)); // Get |θ_f| from cos
// env->logs[i].lane_heading_aligned_rate += (cos_theta >= LANE_ALIGN_COS_THRESHOLD) ? 1.0f : 0.0f; <- Commented
// because this is already covered by our own lane-alignment metric
// Lane center distance
float lane_center_distance = agent->metrics_array[LANE_DIST_IDX];
env->logs[i].lane_center_rate += fabsf(lane_center_distance) < 0.5f ? 1.0f : 0.0f;

// Rl-align (GIGAFLOW): min(cos,0) + vel_align*min(cos*v,0) + 0.0025*(1-|θ|/(π/2))
float against_lane_penalty = fminf(cos_theta, 0.0f); // negative when >90 degrees off
float vel_aligned_penalty =
agent->reward_coefs[REWARD_COEF_VEL_ALIGN] * fminf(cos_theta * agent->sim_speed, 0.0f);
float alignment_bonus = 0.0025f * (1.0f - theta_f / (M_PI / 2.0f));
// Comfort violation Count
float comfort_violations = agent->metrics_array[COMFORT_VIOLATION_IDX];
env->logs[i].comfort_violation_count += comfort_violations;

float lane_align_reward = agent->reward_coefs[REWARD_COEF_LANE_ALIGN] * env->dt *
(against_lane_penalty + vel_aligned_penalty + alignment_bonus);
// Velocity progress sum
float velocity_progress = agent->metrics_array[VELOCITY_PROGRESS_IDX];
env->logs[i].velocity_progress_sum += velocity_progress;

env->rewards[i] += lane_align_reward;
env->logs[i].episode_return += lane_align_reward;
// Average speed per agent
env->logs[i].avg_speed_per_agent += speed_magnitude;

// Rl-center (GIGAFLOW): -α * dt * (|x_f - bias| - 0.05/exp(|x_f - bias| - 0.5))
float lane_center_distance = agent->metrics_array[LANE_DIST_IDX];
float adjusted_dist = fabsf(lane_center_distance - agent->reward_coefs[REWARD_COEF_CENTER_BIAS]);
float exp_decay = 0.05f / expf(adjusted_dist - 0.5f);
// We only add lane level rewards when conditioning is turned on
if (env->reward_conditioning) {

float lane_center_reward =
agent->reward_coefs[REWARD_COEF_LANE_CENTER] * env->dt * ((cos_theta > 0.5f) * adjusted_dist - exp_decay);
// Add lane rewards
// Get lane angle metric: cos(θ_f) where θ_f = heading diff from lane
float cos_theta = agent->metrics_array[LANE_ANGLE_IDX];
float theta_f = acosf(fminf(fmaxf(cos_theta, -1.0f), 1.0f)); // Get |θ_f| from cos
// env->logs[i].lane_heading_aligned_rate += (cos_theta >= LANE_ALIGN_COS_THRESHOLD) ? 1.0f : 0.0f; <-
// Commented because this is already covered by our own lane-alignment metric

env->rewards[i] += lane_center_reward;
// env->logs[i].lane_center_rate += fabsf(lane_center_distance) < 0.5f ? 1.0f : 0.0f; <- Commented for now (
// need to add this to add_log)
env->logs[i].episode_return += lane_center_reward;
//
// Rl-align (GIGAFLOW): min(cos,0) + vel_align*min(cos*v,0) + 0.0025*(1-|θ|/(π/2))
float against_lane_penalty = fminf(cos_theta, 0.0f); // negative when >90 degrees off
float vel_aligned_penalty =
agent->reward_coefs[REWARD_COEF_VEL_ALIGN] * fminf(cos_theta * speed_magnitude, 0.0f);
float alignment_bonus = 0.0025f * (1.0f - theta_f / (M_PI / 2.0f));

float lane_align_reward = agent->reward_coefs[REWARD_COEF_LANE_ALIGN] * env->dt *
(against_lane_penalty + vel_aligned_penalty + alignment_bonus);

env->rewards[i] += lane_align_reward;
env->logs[i].episode_return += lane_align_reward;

// Rl-center (GIGAFLOW): -α * dt * (|x_f - bias| - 0.05/exp(|x_f - bias| - 0.5))
float adjusted_dist = fabsf(lane_center_distance - agent->reward_coefs[REWARD_COEF_CENTER_BIAS]);
float exp_decay = 0.05f / expf(adjusted_dist - 0.5f);
float lane_center_reward = agent->reward_coefs[REWARD_COEF_LANE_CENTER] * env->dt *
((cos_theta > 0.5f) * adjusted_dist - exp_decay);

env->rewards[i] += lane_center_reward;
env->logs[i].episode_return += lane_center_reward;
// OTHER REWARDS (From Gigaflow)

// Red light violation reward (GIGAFLOW) - OMITTED

// Comfort reward (GIGAFLOW)

float comfort_penalty = agent->reward_coefs[REWARD_COEF_COMFORT] * comfort_violations;

env->rewards[i] += comfort_penalty;
env->logs[i].episode_return += comfort_penalty;

// Further support TO BE ADDED for all other types of reward conditioning
// Velocity reward (GIGAFLOW)
float velocity_reward = agent->reward_coefs[REWARD_COEF_VELOCITY] * env->dt * velocity_progress;

env->rewards[i] += velocity_reward;
env->logs[i].episode_return += velocity_reward;

// Timestep reward (GIGAFLOW)
float accel = sqrtf(agent->a_long * agent->a_long + agent->a_lat * agent->a_lat);
// Only penalize when moving (v > 0) or accelerating (a > 0)
if (speed_magnitude > 0.01f || accel > 0.01f) {
float timestep_penalty = agent->reward_coefs[REWARD_COEF_TIMESTEP] * env->dt;

env->rewards[i] += timestep_penalty;
env->logs[i].episode_return += timestep_penalty;
}

// Reverse reward (GIGAFLOW)
if (signed_speed < -0.01f) {
float reverse_penalty = agent->reward_coefs[REWARD_COEF_REVERSE] * env->dt;

env->rewards[i] += reverse_penalty;
env->logs[i].episode_return += reverse_penalty;
}

// Over speed reward (GIGAFLOW++)
float speed_reward = agent->reward_coefs[REWARD_COEF_OVERSPEED] * agent->metrics_array[SPEED_LIMIT_IDX];

env->rewards[i] += speed_reward;
env->logs[i].episode_return += speed_reward;
}
Comment on lines +2648 to +2717
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lane rewards dropped when reward_conditioning is off

Previously, lane alignment and lane center rewards were always computed and applied unconditionally. This PR moves them inside the if (env->reward_conditioning) block, meaning when reward_conditioning is disabled, agents no longer receive any lane-level rewards at all.

If this is intentional (i.e., the non-conditioning path is expected to not have lane rewards), it would be helpful to document that somewhere. If not, the lane align and lane center reward logic should be moved back outside the conditioning block as before, with only the new rewards (comfort, velocity, timestep, reverse, overspeed) guarded by the flag.

}

if (env->goal_behavior == GOAL_RESPAWN) {
Expand Down