
Commit ce388a4: [rllib] Learner should not see clipped actions (#3496)
Parent: 87c0d24

File tree: 5 files changed (+75, -22 lines)

doc/source/rllib-env.rst

Lines changed: 8 additions & 5 deletions
@@ -24,18 +24,21 @@ ARS **Yes** **Yes** No No
 
 .. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces
 
-You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name <https://gym.openai.com/envs>`__. Custom env classes must take a single ``env_config`` parameter in their constructor:
+You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name <https://gym.openai.com/envs>`__. Custom env classes passed directly to the agent must take a single ``env_config`` parameter in their constructor:
 
 .. code-block:: python
 
-    import ray
+    import gym, ray
     from ray.rllib.agents import ppo
 
     class MyEnv(gym.Env):
         def __init__(self, env_config):
-            self.action_space = ...
-            self.observation_space = ...
-            ...
+            self.action_space = <gym.Space>
+            self.observation_space = <gym.Space>
+        def reset(self):
+            return <obs>
+        def step(self, action):
+            return <obs>, <reward: float>, <done: bool>, <info: dict>
 
     ray.init()
     trainer = ppo.PPOAgent(env=MyEnv, config={
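
To make the doc skeleton above concrete, here is a minimal runnable version that fills in the <gym.Space> / <obs> placeholders. The corridor task, its spaces, and the config values are illustrative choices for this sketch, not part of the commit:

import gym
import numpy as np
import ray
from ray.rllib.agents import ppo


class MyEnv(gym.Env):
    """Toy corridor: the agent walks right until it reaches the end."""

    def __init__(self, env_config):
        self.end_pos = env_config.get("corridor_length", 5)
        self.cur_pos = 0
        self.action_space = gym.spaces.Discrete(2)  # 0 = left, 1 = right
        self.observation_space = gym.spaces.Box(
            0.0, self.end_pos, shape=(1,), dtype=np.float32)

    def reset(self):
        self.cur_pos = 0
        return np.array([self.cur_pos], dtype=np.float32)

    def step(self, action):
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
        reward = 1.0 if done else -0.1
        return np.array([self.cur_pos], dtype=np.float32), reward, done, {}


if __name__ == "__main__":
    ray.init()
    trainer = ppo.PPOAgent(env=MyEnv, config={
        "env_config": {"corridor_length": 5},  # passed to MyEnv.__init__
    })
    print(trainer.train())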

python/ray/rllib/evaluation/sampler.py

Lines changed: 15 additions & 16 deletions
@@ -287,12 +287,12 @@ def new_episode():
 
         # Do batched policy eval
         eval_results = _do_policy_eval(tf_sess, to_eval, policies,
-                                       active_episodes, clip_actions)
+                                       active_episodes)
 
         # Process results and update episode state
         actions_to_send = _process_policy_eval_results(
             to_eval, eval_results, active_episodes, active_envs,
-            off_policy_actions)
+            off_policy_actions, policies, clip_actions)
 
         # Return computed actions to ready envs. We also send to envs that have
         # taken off-policy actions; those envs are free to ignore the action.
@@ -448,7 +448,7 @@ def _process_observations(async_vector_env, policies, batch_builder_pool,
     return active_envs, to_eval, outputs
 
 
-def _do_policy_eval(tf_sess, to_eval, policies, active_episodes, clip_actions):
+def _do_policy_eval(tf_sess, to_eval, policies, active_episodes):
     """Call compute actions on observation batches to get next actions.
 
     Returns:
@@ -483,18 +483,12 @@ def _do_policy_eval(tf_sess, to_eval, policies, active_episodes, clip_actions):
     for k, v in pending_fetches.items():
         eval_results[k] = builder.get(v)
 
-    if clip_actions:
-        for policy_id, results in eval_results.items():
-            policy = _get_or_raise(policies, policy_id)
-            actions, rnn_out_cols, pi_info_cols = results
-            eval_results[policy_id] = (_clip_actions(
-                actions, policy.action_space), rnn_out_cols, pi_info_cols)
-
     return eval_results
 
 
 def _process_policy_eval_results(to_eval, eval_results, active_episodes,
-                                 active_envs, off_policy_actions):
+                                 active_envs, off_policy_actions, policies,
+                                 clip_actions):
     """Process the output of policy neural network evaluation.
 
     Records policy evaluation results into the given episode objects and
@@ -521,10 +515,15 @@ def _process_policy_eval_results(to_eval, eval_results, active_episodes,
             pi_info_cols["state_out_{}".format(f_i)] = column
         # Save output rows
         actions = _unbatch_tuple_actions(actions)
+        policy = _get_or_raise(policies, policy_id)
         for i, action in enumerate(actions):
             env_id = eval_data[i].env_id
             agent_id = eval_data[i].agent_id
-            actions_to_send[env_id][agent_id] = action
+            if clip_actions:
+                actions_to_send[env_id][agent_id] = _clip_actions(
+                    action, policy.action_space)
+            else:
+                actions_to_send[env_id][agent_id] = action
             episode = active_episodes[env_id]
             episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
             episode._set_last_pi_info(
@@ -562,7 +561,7 @@ def _clip_actions(actions, space):
     """Called to clip actions to the specified range of this policy.
 
     Arguments:
-        actions: Batch of actions or TupleActions.
+        actions: Single action.
         space: Action space the actions should be present in.
 
     Returns:
@@ -572,13 +571,13 @@ def _clip_actions(actions, space):
     if isinstance(space, gym.spaces.Box):
         return np.clip(actions, space.low, space.high)
     elif isinstance(space, gym.spaces.Tuple):
-        if not isinstance(actions, TupleActions):
+        if type(actions) not in (tuple, list):
             raise ValueError("Expected tuple space for actions {}: {}".format(
                 actions, space))
         out = []
-        for a, s in zip(actions.batches, space.spaces):
+        for a, s in zip(actions, space.spaces):
             out.append(_clip_actions(a, s))
-        return TupleActions(out)
+        return out
     else:
         return actions
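
The net effect of this change is that _clip_actions is no longer applied inside _do_policy_eval, where the clipped values would flow into the sample batches the learner trains on; instead, clipping is applied per action in _process_policy_eval_results, only on the copy sent back to the environment. Below is a standalone sketch of that per-action clipping, using simplified names rather than the RLlib internals verbatim:

import gym
import numpy as np


def clip_action(action, space):
    """Clip a single (possibly nested Tuple) action into its space's bounds."""
    if isinstance(space, gym.spaces.Box):
        return np.clip(action, space.low, space.high)
    elif isinstance(space, gym.spaces.Tuple):
        if type(action) not in (tuple, list):
            raise ValueError(
                "Expected tuple action for space {}: {}".format(space, action))
        return [clip_action(a, s) for a, s in zip(action, space.spaces)]
    else:
        return action  # e.g. Discrete actions need no clipping


if __name__ == "__main__":
    space = gym.spaces.Tuple([
        gym.spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32),
        gym.spaces.Discrete(3),
    ])
    raw_action = ([2.5, -3.0], 1)  # what the policy sampled (kept for the learner)
    env_action = clip_action(raw_action, space)  # what the env actually receives
    print(raw_action, "->", env_action)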

python/ray/rllib/test/multiagent_pendulum.py (new file)

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+"""Integration test: (1) pendulum works, (2) single-agent multi-agent works."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ray
+from ray.rllib.test.test_multi_agent_env import make_multiagent
+from ray.tune import run_experiments
+from ray.tune.registry import register_env
+
+if __name__ == "__main__":
+    ray.init()
+    MultiPendulum = make_multiagent("Pendulum-v0")
+    register_env("multi_pend", lambda _: MultiPendulum(1))
+    trials = run_experiments({
+        "test": {
+            "run": "PPO",
+            "env": "multi_pend",
+            "stop": {
+                "timesteps_total": 500000,
+                "episode_reward_mean": -200,
+            },
+            "config": {
+                "train_batch_size": 2048,
+                "vf_clip_param": 10.0,
+                "num_workers": 0,
+                "num_envs_per_worker": 10,
+                "lambda": 0.1,
+                "gamma": 0.95,
+                "lr": 0.0003,
+                "sgd_minibatch_size": 64,
+                "num_sgd_iter": 10,
+                "model": {
+                    "fcnet_hiddens": [64, 64],
+                },
+                "batch_mode": "complete_episodes",
+            },
+        }
+    })
+    if trials[0].last_result["episode_reward_mean"] < -200:
+        raise ValueError("Did not get to -200 reward", trials[0].last_result)
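
The test leans on make_multiagent from ray.rllib.test.test_multi_agent_env, which is not part of this diff. Roughly, it wraps a single-agent gym env into a MultiAgentEnv holding N independent copies; the sketch below is an approximation inferred from that usage (import path and details assumed from the Ray 0.6-era API, not copied from the actual helper):

import gym
from ray.rllib.env.multi_agent_env import MultiAgentEnv


def make_multiagent(env_name):
    """Return a MultiAgentEnv class that runs `num` copies of `env_name`."""

    class MultiEnv(MultiAgentEnv):
        def __init__(self, num):
            self.agents = [gym.make(env_name) for _ in range(num)]
            self.dones = set()

        def reset(self):
            self.dones = set()
            return {i: a.reset() for i, a in enumerate(self.agents)}

        def step(self, action_dict):
            obs, rew, done, info = {}, {}, {}, {}
            for i, action in action_dict.items():
                obs[i], rew[i], done[i], info[i] = self.agents[i].step(action)
                if done[i]:
                    self.dones.add(i)
            # The wrapper episode ends once every sub-env is done.
            done["__all__"] = len(self.dones) == len(self.agents)
            return obs, rew, done, info

    return MultiEnv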

python/ray/rllib/tuned_examples/pendulum-ppo.yaml

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,8 @@ pendulum-ppo:
     config:
         train_batch_size: 2048
         vf_clip_param: 10.0
-        num_workers: 2
+        num_workers: 0
+        num_envs_per_worker: 10
         lambda: 0.1
         gamma: 0.95
         lr: 0.0003
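
The tuned example now mirrors the integration test above: num_workers: 0 samples in the driver process, with num_envs_per_worker: 10 providing vectorized throughput instead of extra worker processes. For reference, one way to launch such a YAML experiment spec through Tune (roughly what RLlib's train script does with -f; the file path here is illustrative):

import yaml

import ray
from ray.tune import run_experiments

if __name__ == "__main__":
    ray.init()
    # The YAML maps an experiment name to its spec, e.g.
    # {"pendulum-ppo": {"run": "PPO", "env": "Pendulum-v0", "config": {...}}}
    with open("python/ray/rllib/tuned_examples/pendulum-ppo.yaml") as f:
        experiments = yaml.safe_load(f)
    run_experiments(experiments)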

test/jenkins_tests/run_multi_node_tests.sh

Lines changed: 8 additions & 0 deletions
@@ -299,6 +299,14 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/test/test_rollout.sh
 
+# Try a couple times since it's stochastic
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    python /ray/python/ray/rllib/test/multiagent_pendulum.py || \
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    python /ray/python/ray/rllib/test/multiagent_pendulum.py || \
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    python /ray/python/ray/rllib/test/multiagent_pendulum.py
+
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     python /ray/python/ray/tune/examples/tune_mnist_ray.py \
     --smoke-test
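
The || chain simply reruns the stochastic integration test up to three times and fails only if all attempts fail. An equivalent retry wrapper in Python (a hypothetical helper, not something this commit adds) could look like:

import subprocess
import sys


def run_with_retries(cmd, attempts=3):
    """Run `cmd`, retrying up to `attempts` times; return the last exit code."""
    code = 1
    for i in range(attempts):
        code = subprocess.call(cmd)
        if code == 0:
            return 0
        print("Attempt {}/{} failed with exit code {}".format(
            i + 1, attempts, code))
    return code


if __name__ == "__main__":
    sys.exit(run_with_retries(
        ["python", "/ray/python/ray/rllib/test/multiagent_pendulum.py"]))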
