@@ -16,26 +16,27 @@ using namespace cpprl;
 
 // Algorithm hyperparameters
 const std::string algorithm = "PPO";
-const int batch_size = 2048;
+const float actor_loss_coef = 1.0;
+const int batch_size = 40;
 const float clip_param = 0.2;
 const float discount_factor = 0.99;
-const float entropy_coef = 0.001;
+const float entropy_coef = 1e-3;
 const float gae = 0.95;
-const float learning_rate = 2.5e-4;
+const float kl_target = 0.05;
+const float learning_rate = 7e-4;
 const int log_interval = 1;
 const int max_frames = 10e+7;
-const int num_epoch = 10;
-const int num_mini_batch = 32;
+const int num_epoch = 3;
+const int num_mini_batch = 20;
 const int reward_average_window_size = 10;
+const float reward_clip_value = 10; // Post scaling
 const bool use_gae = true;
-const bool use_lr_decay = true;
-const float actor_loss_coef = 1.0;
+const bool use_lr_decay = false;
 const float value_loss_coef = 0.5;
 
 // Environment hyperparameters
-const float env_gamma = discount_factor; // Set to -1 to disable
-const std::string env_name = "BipedalWalkerHardcore-v2";
-const int num_envs = 16;
+const std::string env_name = "LunarLander-v2";
+const int num_envs = 8;
 const float render_reward_threshold = 160;
 
 // Model hyperparameters
@@ -80,7 +81,6 @@ int main(int argc, char *argv[])
     spdlog::info("Creating environment");
     auto make_param = std::make_shared<MakeParam>();
     make_param->env_name = env_name;
-    make_param->gamma = env_gamma;
     make_param->num_envs = num_envs;
     Request<MakeParam> make_request("make", make_param);
     communicator.send_request(make_request);
@@ -125,7 +125,17 @@ int main(int argc, char *argv[])
     }
     base->to(device);
     ActionSpace space{env_info->action_space_type, env_info->action_space_shape};
-    Policy policy(space, base);
+    Policy policy(nullptr);
+    if (env_info->observation_space_shape.size() == 1)
+    {
+        // With observation normalization
+        policy = Policy(space, base, true);
+    }
+    else
+    {
+        // Without observation normalization
+        policy = Policy(space, base, false);
+    }
     policy->to(device);
     RolloutStorage storage(batch_size, num_envs, env_info->observation_space_shape, space, hidden_size, device);
     std::unique_ptr<Algorithm> algo;
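Note on the branch above: the third Policy constructor argument appears to toggle observation normalization, enabled here only for flat (1-D) observation vectors, while image observations are left to the CNN base. A rough sketch of the idea, assuming a running-statistics update (this is not cpprl's actual implementation; the momentum update and the clamp range are illustrative assumptions):

    #include <torch/torch.h>

    // Illustrative only: standardize observations with running statistics
    // before the network sees them. cpprl presumably keeps equivalent
    // statistics inside the Policy when the normalization flag is set.
    torch::Tensor normalize_obs(const torch::Tensor &obs,
                                torch::Tensor &running_mean,
                                torch::Tensor &running_var,
                                float momentum = 0.01)
    {
        running_mean = (1 - momentum) * running_mean + momentum * obs.mean(0);
        running_var = (1 - momentum) * running_var + momentum * obs.var(0, false);
        return torch::clamp((obs - running_mean) / torch::sqrt(running_var + 1e-8),
                            -10, 10);
    }
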
@@ -135,7 +145,17 @@ int main(int argc, char *argv[])
     }
     else if (algorithm == "PPO")
     {
-        algo = std::make_unique<PPO>(policy, clip_param, num_epoch, num_mini_batch, actor_loss_coef, value_loss_coef, entropy_coef, learning_rate);
+        algo = std::make_unique<PPO>(policy,
+                                     clip_param,
+                                     num_epoch,
+                                     num_mini_batch,
+                                     actor_loss_coef,
+                                     value_loss_coef,
+                                     entropy_coef,
+                                     learning_rate,
+                                     1e-8,
+                                     0.5,
+                                     kl_target);
     }
 
     storage.set_first_observation(observation);
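The PPO constructor now receives two additional literals ahead of kl_target. Based on common PPO implementations these are most likely the optimizer epsilon and the gradient-clipping norm that previously took their default values, but that is an assumption, not something this diff confirms. A hypothetical, equivalent form of the call with the literals named:

    // Hypothetical rewrite of the call above; the meanings of 1e-8 and 0.5
    // are assumptions (optimizer epsilon and gradient-clipping threshold).
    const float adam_epsilon = 1e-8;
    const float max_grad_norm = 0.5;
    algo = std::make_unique<PPO>(policy, clip_param, num_epoch, num_mini_batch,
                                 actor_loss_coef, value_loss_coef, entropy_coef,
                                 learning_rate, adam_epsilon, max_grad_norm,
                                 kl_target);
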
@@ -144,6 +164,8 @@ int main(int argc, char *argv[])
     int episode_count = 0;
     bool render = false;
     std::vector<float> reward_history(reward_average_window_size);
+    RunningMeanStd returns_rms(1);
+    auto returns = torch::zeros({num_envs});
 
     auto start_time = std::chrono::high_resolution_clock::now();
 
@@ -159,14 +181,21 @@ int main(int argc, char *argv[])
                                          storage.get_hidden_states()[step],
                                          storage.get_masks()[step]);
             }
-            auto actions_tensor = act_result[1].cpu();
+            auto actions_tensor = act_result[1].cpu().to(torch::kFloat);
             float *actions_array = actions_tensor.data<float>();
             std::vector<std::vector<float>> actions(num_envs);
             for (int i = 0; i < num_envs; ++i)
             {
-                for (int j = 0; j < env_info->action_space_shape[0]; j++)
+                if (space.type == "Discrete")
+                {
+                    actions[i] = {actions_array[i]};
+                }
+                else
                 {
-                    actions[i].push_back(actions_array[i * env_info->action_space_shape[0] + j]);
+                    for (int j = 0; j < env_info->action_space_shape[0]; j++)
+                    {
+                        actions[i].push_back(actions_array[i * env_info->action_space_shape[0] + j]);
+                    }
                 }
             }
 
@@ -183,7 +212,13 @@ int main(int argc, char *argv[])
                 auto step_result = communicator.get_response<CnnStepResponse>();
                 observation_vec = flatten_vector(step_result->observation);
                 observation = torch::from_blob(observation_vec.data(), observation_shape).to(device);
-                rewards = flatten_vector(step_result->reward);
+                auto raw_reward_vec = flatten_vector(step_result->real_reward);
+                auto reward_tensor = torch::from_blob(raw_reward_vec.data(), {num_envs}, torch::kFloat);
+                returns = returns * discount_factor + reward_tensor;
+                returns_rms->update(returns);
+                reward_tensor = torch::clamp(reward_tensor / torch::sqrt(returns_rms->get_variance() + 1e-8),
+                                             -reward_clip_value, reward_clip_value);
+                rewards = std::vector<float>(reward_tensor.data<float>(), reward_tensor.data<float>() + reward_tensor.numel());
                 real_rewards = flatten_vector(step_result->real_reward);
                 dones_vec = step_result->done;
             }
@@ -192,7 +227,13 @@ int main(int argc, char *argv[])
                 auto step_result = communicator.get_response<MlpStepResponse>();
                 observation_vec = flatten_vector(step_result->observation);
                 observation = torch::from_blob(observation_vec.data(), observation_shape).to(device);
-                rewards = flatten_vector(step_result->reward);
+                auto raw_reward_vec = flatten_vector(step_result->real_reward);
+                auto reward_tensor = torch::from_blob(raw_reward_vec.data(), {num_envs}, torch::kFloat);
+                returns = returns * discount_factor + reward_tensor;
+                returns_rms->update(returns);
+                reward_tensor = torch::clamp(reward_tensor / torch::sqrt(returns_rms->get_variance() + 1e-8),
+                                             -reward_clip_value, reward_clip_value);
+                rewards = std::vector<float>(reward_tensor.data<float>(), reward_tensor.data<float>() + reward_tensor.numel());
                 real_rewards = flatten_vector(step_result->real_reward);
                 dones_vec = step_result->done;
             }
@@ -203,6 +244,7 @@ int main(int argc, char *argv[])
                 {
                     reward_history[episode_count % reward_average_window_size] = running_rewards[i];
                     running_rewards[i] = 0;
+                    returns[i] = 0;
                     episode_count++;
                 }
             }