[rllib] Learner should not see clipped actions #3496

Merged (10 commits, Dec 10, 2018)
13 changes: 8 additions & 5 deletions doc/source/rllib-env.rst
@@ -24,18 +24,21 @@ ARS **Yes** **Yes** No No

.. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces

You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name <https://gym.openai.com/envs>`__. Custom env classes must take a single ``env_config`` parameter in their constructor:
You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name <https://gym.openai.com/envs>`__. Custom env classes passed directly to the agent must take a single ``env_config`` parameter in their constructor:

.. code-block:: python

import ray
import gym, ray
from ray.rllib.agents import ppo

class MyEnv(gym.Env):
def __init__(self, env_config):
self.action_space = ...
self.observation_space = ...
...
self.action_space = <gym.Space>
self.observation_space = <gym.Space>
def reset(self):
return <obs>
def step(self, action):
return <obs>, <reward: float>, <done: bool>, <info: dict>

ray.init()
trainer = ppo.PPOAgent(env=MyEnv, config={
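For context, an editor's illustrative sketch (not part of this diff): the string-name route mentioned in the prose above also works for custom classes if they are registered via `register_env` from `ray.tune.registry`. This assumes the `MyEnv` class and imports from the snippet above, plus an illustrative name "my_env":

    from ray.tune.registry import register_env

    # Register an env creator under a string name; RLlib forwards the
    # agent config's "env_config" dict to the creator function.
    register_env("my_env", lambda env_config: MyEnv(env_config))

    ray.init()
    trainer = ppo.PPOAgent(env="my_env", config={
        "env_config": {},  # passed to MyEnv.__init__
    })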
31 changes: 15 additions & 16 deletions python/ray/rllib/evaluation/sampler.py
@@ -287,12 +287,12 @@ def new_episode():

# Do batched policy eval
eval_results = _do_policy_eval(tf_sess, to_eval, policies,
active_episodes, clip_actions)
active_episodes)

# Process results and update episode state
actions_to_send = _process_policy_eval_results(
to_eval, eval_results, active_episodes, active_envs,
off_policy_actions)
off_policy_actions, policies, clip_actions)

# Return computed actions to ready envs. We also send to envs that have
# taken off-policy actions; those envs are free to ignore the action.
@@ -448,7 +448,7 @@ def _process_observations(async_vector_env, policies, batch_builder_pool,
return active_envs, to_eval, outputs


def _do_policy_eval(tf_sess, to_eval, policies, active_episodes, clip_actions):
def _do_policy_eval(tf_sess, to_eval, policies, active_episodes):
"""Call compute actions on observation batches to get next actions.

Returns:
@@ -483,18 +483,12 @@ def _do_policy_eval(tf_sess, to_eval, policies, active_episodes, clip_actions):
for k, v in pending_fetches.items():
eval_results[k] = builder.get(v)

if clip_actions:
for policy_id, results in eval_results.items():
policy = _get_or_raise(policies, policy_id)
actions, rnn_out_cols, pi_info_cols = results
eval_results[policy_id] = (_clip_actions(
actions, policy.action_space), rnn_out_cols, pi_info_cols)

return eval_results


def _process_policy_eval_results(to_eval, eval_results, active_episodes,
active_envs, off_policy_actions):
active_envs, off_policy_actions, policies,
clip_actions):
"""Process the output of policy neural network evaluation.

Records policy evaluation results into the given episode objects and
@@ -521,10 +515,15 @@ def _process_policy_eval_results(to_eval, eval_results, active_episodes,
pi_info_cols["state_out_{}".format(f_i)] = column
# Save output rows
actions = _unbatch_tuple_actions(actions)
policy = _get_or_raise(policies, policy_id)
for i, action in enumerate(actions):
env_id = eval_data[i].env_id
agent_id = eval_data[i].agent_id
actions_to_send[env_id][agent_id] = action
if clip_actions:
actions_to_send[env_id][agent_id] = _clip_actions(
action, policy.action_space)
else:
actions_to_send[env_id][agent_id] = action
episode = active_episodes[env_id]
episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
episode._set_last_pi_info(
@@ -562,7 +561,7 @@ def _clip_actions(actions, space):
"""Called to clip actions to the specified range of this policy.

Arguments:
actions: Batch of actions or TupleActions.
actions: Single action.
space: Action space the actions should be present in.

Returns:
@@ -572,13 +571,13 @@
if isinstance(space, gym.spaces.Box):
return np.clip(actions, space.low, space.high)
elif isinstance(space, gym.spaces.Tuple):
if not isinstance(actions, TupleActions):
if type(actions) not in (tuple, list):
raise ValueError("Expected tuple space for actions {}: {}".format(
actions, space))
out = []
for a, s in zip(actions.batches, space.spaces):
for a, s in zip(actions, space.spaces):
Contributor: what does this change do?

Contributor (Author): It's not a batch anymore.

out.append(_clip_actions(a, s))
return TupleActions(out)
return out
else:
return actions

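To summarize the behavioral change in sampler.py (an editor's standalone sketch, not the exact RLlib code): actions are now clipped one at a time, right before being sent to the environment, while the unclipped actions sampled by the policy are what get recorded for the learner. Roughly:

    import gym
    import numpy as np

    def clip_action(action, space):
        # Clip a single (possibly nested) action to the bounds of its space.
        if isinstance(space, gym.spaces.Box):
            return np.clip(action, space.low, space.high)
        elif isinstance(space, gym.spaces.Tuple):
            if type(action) not in (tuple, list):
                raise ValueError(
                    "Expected tuple action {} for space {}".format(action, space))
            return [clip_action(a, s) for a, s in zip(action, space.spaces)]
        else:
            return action

    space = gym.spaces.Box(low=-2.0, high=2.0, shape=(1,), dtype=np.float32)
    sampled = np.array([3.5])                  # stored unclipped in the sample batch
    sent_to_env = clip_action(sampled, space)  # the env receives [2.0]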
42 changes: 42 additions & 0 deletions python/ray/rllib/test/multiagent_pendulum.py
@@ -0,0 +1,42 @@
"""Integration test: (1) pendulum works, (2) single-agent multi-agent works."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import ray
from ray.rllib.test.test_multi_agent_env import make_multiagent
from ray.tune import run_experiments
from ray.tune.registry import register_env

if __name__ == "__main__":
ray.init()
MultiPendulum = make_multiagent("Pendulum-v0")
register_env("multi_pend", lambda _: MultiPendulum(1))
trials = run_experiments({
"test": {
"run": "PPO",
"env": "multi_pend",
"stop": {
"timesteps_total": 500000,
"episode_reward_mean": -200,
},
"config": {
"train_batch_size": 2048,
"vf_clip_param": 10.0,
"num_workers": 0,
"num_envs_per_worker": 10,
"lambda": 0.1,
"gamma": 0.95,
"lr": 0.0003,
"sgd_minibatch_size": 64,
"num_sgd_iter": 10,
"model": {
"fcnet_hiddens": [64, 64],
},
"batch_mode": "complete_episodes",
},
}
})
if trials[0].last_result["episode_reward_mean"] < -200:
raise ValueError("Did not get to -200 reward", trials[0].last_result)
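The `make_multiagent` helper imported above wraps a gym environment in RLlib's multi-agent API (one sub-environment per agent). A rough, hypothetical sketch of that pattern, not the actual code in `test_multi_agent_env.py`, and with an import path that may differ by Ray version:

    import gym
    from ray.rllib.env.multi_agent_env import MultiAgentEnv

    def make_multiagent(env_name):
        class MultiEnv(MultiAgentEnv):
            def __init__(self, num_agents):
                # One independent copy of the underlying gym env per agent.
                self.agents = [gym.make(env_name) for _ in range(num_agents)]
                self.dones = set()

            def reset(self):
                self.dones = set()
                return {i: env.reset() for i, env in enumerate(self.agents)}

            def step(self, action_dict):
                obs, rew, done, info = {}, {}, {}, {}
                for i, action in action_dict.items():
                    obs[i], rew[i], done[i], info[i] = self.agents[i].step(action)
                    if done[i]:
                        self.dones.add(i)
                # "__all__" tells RLlib when the whole episode is over.
                done["__all__"] = len(self.dones) == len(self.agents)
                return obs, rew, done, info

        return MultiEnv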
3 changes: 2 additions & 1 deletion python/ray/rllib/tuned_examples/pendulum-ppo.yaml
@@ -5,7 +5,8 @@ pendulum-ppo:
config:
train_batch_size: 2048
vf_clip_param: 10.0
num_workers: 2
num_workers: 0
num_envs_per_worker: 10
lambda: 0.1
gamma: 0.95
lr: 0.0003
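Since the YAML's top-level structure mirrors the experiment spec accepted by `run_experiments` (as the test script above suggests), the tuned example can be launched from Python with a few lines; a minimal sketch, assuming the file is available locally as `pendulum-ppo.yaml`:

    import yaml
    import ray
    from ray.tune import run_experiments

    ray.init()
    with open("pendulum-ppo.yaml") as f:
        experiments = yaml.safe_load(f)  # {"pendulum-ppo": {"run": ..., "config": {...}}}
    run_experiments(experiments)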
8 changes: 8 additions & 0 deletions test/jenkins_tests/run_multi_node_tests.sh
@@ -299,6 +299,14 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/python/ray/rllib/test/test_rollout.sh

# Try a couple times since it's stochastic
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/rllib/test/multiagent_pendulum.py || \
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/rllib/test/multiagent_pendulum.py || \
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/rllib/test/multiagent_pendulum.py

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/tune/examples/tune_mnist_ray.py \
--smoke-test