
Commit f51c086

Author: Omegastick
Add actor loss coefficient hyperparameter to PPO

1 parent: ccc4177

File tree: 3 files changed (+9, -5 lines)

example/gym_client.cpp (2 additions, 1 deletion)

@@ -29,6 +29,7 @@ const int num_mini_batch = 32;
 const int reward_average_window_size = 10;
 const bool use_gae = true;
 const bool use_lr_decay = true;
+const float actor_loss_coef = 1.0;
 const float value_loss_coef = 0.5;
 
 // Environment hyperparameters
@@ -134,7 +135,7 @@ int main(int argc, char *argv[])
     }
     else if (algorithm == "PPO")
     {
-        algo = std::make_unique<PPO>(policy, clip_param, num_epoch, num_mini_batch, value_loss_coef, entropy_coef, learning_rate);
+        algo = std::make_unique<PPO>(policy, clip_param, num_epoch, num_mini_batch, actor_loss_coef, value_loss_coef, entropy_coef, learning_rate);
     }
 
     storage.set_first_observation(observation);
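
A value of 1.0 for actor_loss_coef keeps the policy-gradient (actor) term weighted exactly as it was before this commit; other values rescale it against the value and entropy terms. A minimal sketch of the updated call site follows (the 0.5 below is a hypothetical tuning value, not part of this commit):

// Hypothetical tuning example (not from this commit): halve the weight of the
// actor term relative to the value loss.
const float actor_loss_coef = 0.5;
algo = std::make_unique<PPO>(policy, clip_param, num_epoch, num_mini_batch,
                             actor_loss_coef, value_loss_coef, entropy_coef,
                             learning_rate);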

include/cpprl/algorithms/ppo.h (2 additions, 1 deletion)

@@ -16,7 +16,7 @@ class PPO : public Algorithm
 {
   private:
     Policy &policy;
-    float value_loss_coef, entropy_coef, max_grad_norm, original_learning_rate, original_clip_param;
+    float actor_loss_coef, value_loss_coef, entropy_coef, max_grad_norm, original_learning_rate, original_clip_param;
     int num_epoch, num_mini_batch;
     std::unique_ptr<torch::optim::Adam> optimizer;
 
@@ -25,6 +25,7 @@ class PPO : public Algorithm
         float clip_param,
         int num_epoch,
         int num_mini_batch,
+        float actor_loss_coef,
        float value_loss_coef,
        float entropy_coef,
        float learning_rate,
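
For positional callers, the full constructor parameter order after this change, reconstructed from the definition in src/algorithms/ppo.cpp below (indentation and any default arguments are assumptions; this hunk does not show them):

// Sketch of the updated declaration: the new parameter sits between
// num_mini_batch and value_loss_coef, so existing positional calls must be updated.
PPO(Policy &policy,
    float clip_param,
    int num_epoch,
    int num_mini_batch,
    float actor_loss_coef, // added in this commit
    float value_loss_coef,
    float entropy_coef,
    float learning_rate,
    float epsilon,
    float max_grad_norm);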

src/algorithms/ppo.cpp (5 additions, 3 deletions)

@@ -19,12 +19,14 @@ PPO::PPO(Policy &policy,
          float clip_param,
          int num_epoch,
          int num_mini_batch,
+         float actor_loss_coef,
          float value_loss_coef,
          float entropy_coef,
          float learning_rate,
          float epsilon,
          float max_grad_norm)
     : policy(policy),
+      actor_loss_coef(actor_loss_coef),
       value_loss_coef(value_loss_coef),
       entropy_coef(entropy_coef),
       max_grad_norm(max_grad_norm),
@@ -104,7 +106,7 @@ std::vector<UpdateDatum> PPO::update(RolloutStorage &rollouts, float decay_level
 
             // Total loss
             auto loss = (value_loss * value_loss_coef +
-                         action_loss -
+                         action_loss * actor_loss_coef -
                          evaluate_result[2] * entropy_coef);
 
             // Step optimizer
@@ -139,7 +141,7 @@ TEST_CASE("PPO")
     ActionSpace space{"Discrete", {2}};
     Policy policy(space, base);
     RolloutStorage storage(20, 2, {1}, space, 5, torch::kCPU);
-    PPO ppo(policy, 0.2, 3, 5, 0.5, 1e-3, 0.001);
+    PPO ppo(policy, 0.2, 3, 5, 1, 0.5, 1e-3, 0.001);
 
     // The reward is the action
     auto pre_game_probs = policy->get_probs(
@@ -208,7 +210,7 @@ TEST_CASE("PPO")
     ActionSpace space{"Discrete", {2}};
     Policy policy(space, base);
     RolloutStorage storage(20, 2, {1}, space, 5, torch::kCPU);
-    PPO ppo(policy, 0.2, 3, 5, 0.5, 1e-3, 0.001);
+    PPO ppo(policy, 0.2, 3, 5, 1, 0.5, 1e-3, 0.001);
 
     // The game is: If the action matches the input, give a reward of 1, otherwise -1
     auto pre_game_probs = policy->get_probs(
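
To make the new weighting concrete, here is a small self-contained libtorch sketch of the total-loss combination as it reads after this commit. The scalar values are made up and the variable names mirror PPO::update(); with actor_loss_coef = 1 the result matches the pre-commit loss.

#include <iostream>
#include <torch/torch.h>

int main()
{
    // Stand-in scalars for the terms computed inside PPO::update().
    auto value_loss = torch::tensor(0.8f);
    auto action_loss = torch::tensor(-0.3f);
    auto entropy = torch::tensor(1.2f); // evaluate_result[2] in ppo.cpp

    const float actor_loss_coef = 1.0f; // 1.0 reproduces the old weighting
    const float value_loss_coef = 0.5f;
    const float entropy_coef = 1e-3f;

    // Same combination as the updated "Total loss" expression above.
    auto loss = value_loss * value_loss_coef +
                action_loss * actor_loss_coef -
                entropy * entropy_coef;

    std::cout << loss.item<float>() << std::endl;
}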
