
Commit 78d7008

Ervin T authored and vincentpierre committed
[bug-fix] Improve performance for PPO with continuous actions (#3662)
1 parent f864337 commit 78d7008

3 files changed: +29 −13 lines changed


ml-agents/mlagents/trainers/distributions.py

Lines changed: 25 additions & 11 deletions
@@ -64,6 +64,7 @@ def __init__(
         act_size: List[int],
         reparameterize: bool = False,
         tanh_squash: bool = False,
+        condition_sigma: bool = True,
         log_sigma_min: float = -20,
         log_sigma_max: float = 2,
     ):
@@ -79,7 +80,11 @@ def __init__(
         :param log_sigma_max: Maximum log standard deviation to clip by.
         """
         encoded = self._create_mu_log_sigma(
-            logits, act_size, log_sigma_min, log_sigma_max
+            logits,
+            act_size,
+            log_sigma_min,
+            log_sigma_max,
+            condition_sigma=condition_sigma,
         )
         self._sampled_policy = self._create_sampled_policy(encoded)
         if not reparameterize:
@@ -101,6 +106,7 @@ def _create_mu_log_sigma(
         act_size: List[int],
         log_sigma_min: float,
         log_sigma_max: float,
+        condition_sigma: bool,
     ) -> "GaussianDistribution.MuSigmaTensors":
 
         mu = tf.layers.dense(
@@ -112,14 +118,22 @@ def _create_mu_log_sigma(
             reuse=tf.AUTO_REUSE,
         )
 
-        # Policy-dependent log_sigma_sq
-        log_sigma = tf.layers.dense(
-            logits,
-            act_size[0],
-            activation=None,
-            name="log_std",
-            kernel_initializer=ModelUtils.scaled_init(0.01),
-        )
+        if condition_sigma:
+            # Policy-dependent log_sigma_sq
+            log_sigma = tf.layers.dense(
+                logits,
+                act_size[0],
+                activation=None,
+                name="log_std",
+                kernel_initializer=ModelUtils.scaled_init(0.01),
+            )
+        else:
+            log_sigma = tf.get_variable(
+                "log_std",
+                [act_size[0]],
+                dtype=tf.float32,
+                initializer=tf.zeros_initializer(),
+            )
         log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max)
         sigma = tf.exp(log_sigma)
         return self.MuSigmaTensors(mu, log_sigma, sigma)
@@ -155,8 +169,8 @@ def _do_squash_correction_for_tanh(self, probs, squashed_policy):
         """
         Adjust probabilities for squashed sample before output
         """
-        probs -= tf.log(1 - squashed_policy ** 2 + EPSILON)
-        return probs
+        adjusted_probs = probs - tf.log(1 - squashed_policy ** 2 + EPSILON)
+        return adjusted_probs
 
     @property
     def total_log_probs(self) -> tf.Tensor:
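
The change above lets GaussianDistribution produce its log standard deviation in one of two ways: a dense layer driven by the policy's hidden encoding (condition_sigma=True, the previous behaviour) or a single trainable vector that does not depend on the observation (condition_sigma=False). The following standalone sketch, which is not part of the commit, illustrates the two branches; it assumes the TensorFlow 1.x graph-mode API used in this file, and the placeholder input, sizes, and variable names are illustrative only.

import tensorflow as tf  # TF 1.x graph-mode API, as used by ml-agents at this version

act_size = 2       # illustrative action dimensionality
hidden_size = 128  # illustrative encoder output width

# Stand-in for the policy's hidden encoding ("logits" in the diff above).
logits = tf.placeholder(tf.float32, shape=[None, hidden_size], name="policy_hidden")

# condition_sigma=True: log-std is a function of the observation encoding.
log_sigma_conditioned = tf.layers.dense(
    logits, act_size, activation=None, name="log_std_conditioned"
)

# condition_sigma=False: one trainable log-std vector, independent of the observation.
log_sigma_shared = tf.get_variable(
    "log_std_shared",
    shape=[act_size],
    dtype=tf.float32,
    initializer=tf.zeros_initializer(),
)

# Either branch is then clipped and exponentiated to obtain sigma, as in the diff.
sigma = tf.exp(tf.clip_by_value(log_sigma_shared, -20, 2))

With condition_sigma=False the standard deviation is shared across all states, which is the option nn_policy.py wires in through condition_sigma_on_obs below.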

ml-agents/mlagents/trainers/policy/nn_policy.py

Lines changed: 1 addition & 0 deletions
@@ -202,6 +202,7 @@ def _create_cc_actor(
             self.act_size,
             reparameterize=reparameterize,
             tanh_squash=tanh_squash,
+            condition_sigma=condition_sigma_on_obs,
         )
 
         if tanh_squash:

ml-agents/mlagents/trainers/tests/test_simple_rl.py

Lines changed: 3 additions & 2 deletions
@@ -220,13 +220,14 @@ def test_visual_advanced_ppo(vis_encode_type, num_visual):
 def test_recurrent_ppo(use_discrete):
     env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
     override_vals = {
-        "max_steps": 3000,
+        "max_steps": 4000,
         "batch_size": 64,
         "buffer_size": 128,
+        "learning_rate": 1e-3,
         "use_recurrent": True,
     }
     config = generate_config(PPO_CONFIG, override_vals)
-    _check_environment_trains(env, config)
+    _check_environment_trains(env, config, success_threshold=0.9)
 
 
 @pytest.mark.parametrize("use_discrete", [True, False])

0 commit comments