Merged

38 commits
f46c8bb
fixed style issues
alvkao58 Feb 1, 2018
aa13d9c
removed gae, filter, clipping, value estimator for simplification pur…
alvkao58 Feb 1, 2018
a6809f7
added test entry in test_supported_spaces; minor fixes
alvkao58 Feb 1, 2018
391afb2
added description of PGAgent
alvkao58 Feb 1, 2018
39926b1
minor cosmetic changes
alvkao58 Feb 1, 2018
dd12882
eliminated several unnecessary parts of code
alvkao58 Feb 2, 2018
8fce9fc
added jenkins tests, horizon
alvkao58 Feb 8, 2018
38a5720
initial commit for ddpg
alvkao58 Mar 1, 2018
dbfd1db
safety commit before restructuring models
alvkao58 Mar 5, 2018
f2cf37a
fixed network, need to move critic and actor back into separate classes
alvkao58 Mar 5, 2018
09d7221
did a lot of restructuring; first version that runs
alvkao58 Mar 8, 2018
b356a8e
added target update
alvkao58 Mar 8, 2018
c8b9ffc
switched to using SyncLocalReplayOptimizer and pre-existing replay bu…
alvkao58 Mar 14, 2018
ee07ab4
fixed some algorithmic errors, cleaned up code
alvkao58 Mar 22, 2018
bbcfe3a
added fix to actor gradients
alvkao58 Mar 22, 2018
fdf5c29
richard
richardliaw Mar 22, 2018
fc9fbba
some small fixes
richardliaw Mar 22, 2018
6bd2880
updated stats to match training process, added option for parameter s…
alvkao58 Mar 30, 2018
391ea14
nit changes
richardliaw Mar 31, 2018
395cf35
updated actor and critic networks; now learns on Pendulum
alvkao58 Apr 1, 2018
cf3e2fe
style fixes
alvkao58 Apr 1, 2018
66d7f6f
switching from tflearn to slim, making sampler consistent
alvkao58 Apr 2, 2018
6964231
nits
richardliaw Apr 2, 2018
2eaa114
more fixes to actor/critic network
alvkao58 Apr 4, 2018
ae208c5
moved tensorflow-specific stuff out of the evaluator
alvkao58 Apr 4, 2018
d5f9aca
fixed getting/setting weights
alvkao58 Apr 4, 2018
995adc3
formatting fixes
alvkao58 Apr 4, 2018
667b597
added descrptions to config items, moved noise process parameters int…
alvkao58 Apr 4, 2018
870ea84
updated stats collecting
alvkao58 Apr 4, 2018
2f24434
more clean up
alvkao58 Apr 4, 2018
9c9b61b
changed stats to support remote evaluators
alvkao58 Apr 4, 2018
fc7b9d3
made requested formatting changes
alvkao58 Apr 5, 2018
a759eb2
moved actor, critic networks into model directory
alvkao58 Apr 6, 2018
90fad1e
Merge branch 'master' into ddpg
richardliaw Apr 9, 2018
1d76eac
fix from merging
alvkao58 Apr 9, 2018
e5d0649
added back changes that were removed when fixing rebasing
alvkao58 Apr 10, 2018
aab9570
made requested touch ups
alvkao58 Apr 11, 2018
831c333
fix test syntax
alvkao58 Apr 11, 2018
4 changes: 2 additions & 2 deletions python/ray/rllib/__init__.py
@@ -8,8 +8,8 @@


def _register_all():
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "__fake",
"__sigmoid_fake_data", "__parameter_tuning"]:
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
"__fake", "__sigmoid_fake_data", "__parameter_tuning"]:
from ray.rllib.agent import get_agent_class
register_trainable(key, get_agent_class(key))

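With "DDPG" added to this list, the new algorithm is registered under the same name lookup as the existing agents. A minimal sketch of what that lookup gives you (illustrative only; it relies on the `get_agent_class` branch added in `agent.py` below):

```python
from ray.rllib.agent import get_agent_class

# "DDPG" now resolves to the agent class introduced in this PR.
agent_cls = get_agent_class("DDPG")
print(agent_cls)  # ray.rllib.ddpg.ddpg.DDPGAgent
```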
2 changes: 1 addition & 1 deletion python/ray/rllib/a3c/shared_model.py
@@ -38,7 +38,7 @@ def compute_gradients(self, samples):
    def compute_gradients(self, samples):
        info = {}
        feed_dict = {
-            self.x: samples["observations"],
+            self.x: samples["obs"],
            self.ac: samples["actions"],
            self.adv: samples["advantages"],
            self.r: samples["value_targets"],
2 changes: 1 addition & 1 deletion python/ray/rllib/a3c/shared_model_lstm.py
@@ -57,7 +57,7 @@ def compute_gradients(self, samples):
"""
features = samples["features"][0]
feed_dict = {
self.x: samples["observations"],
self.x: samples["obs"],
self.ac: samples["actions"],
self.adv: samples["advantages"],
self.r: samples["value_targets"],
3 changes: 3 additions & 0 deletions python/ray/rllib/agent.py
@@ -243,6 +243,9 @@ def get_agent_class(alg):
elif alg == "PG":
from ray.rllib import pg
return pg.PGAgent
elif alg == "DDPG":
from ray.rllib import ddpg
return ddpg.DDPGAgent
ericl (Contributor) commented Mar 9, 2018:

Nice. It would be good to also

  • make an issue to update the docs; there are a couple new algs not documented by now
  • add a DDPG example to the regression tests folder (tuned_examples/regression_tests)
  • add a DDPG sanity check to multi_node_tests.sh

richardliaw (Contributor) commented Apr 10, 2018:

@alvkao58 can you take care of these comments by Eric? (see PR)

alvkao58 (Contributor, Author) replied:

any particular sanity checks you want to see added to multi_node_tests.sh?
elif alg == "script":
from ray.tune import script_runner
return script_runner.ScriptRunner
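Regarding the regression-test suggestion in the review thread above, here is a hypothetical sketch of a DDPG entry driven through ray.tune; the "pendulum-ddpg" experiment name, the stop criterion, and the Pendulum-v0 environment are illustrative assumptions, not anything included in this PR:

```python
import ray
from ray.tune import run_experiments

ray.init()

# Hypothetical regression-test style experiment. "DDPG" works as a trainable
# name because _register_all() registers it (see the __init__.py diff above).
run_experiments({
    "pendulum-ddpg": {
        "run": "DDPG",
        "env": "Pendulum-v0",
        "stop": {"episode_reward_mean": -160},
        "config": {"num_workers": 0},
    },
})
```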
3 changes: 3 additions & 0 deletions python/ray/rllib/ddpg/__init__.py
@@ -0,0 +1,3 @@
from ray.rllib.ddpg.ddpg import DDPGAgent, DEFAULT_CONFIG

__all__ = ["DDPGAgent", "DEFAULT_CONFIG"]
112 changes: 112 additions & 0 deletions python/ray/rllib/ddpg/ddpg.py
@@ -0,0 +1,112 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import ray
from ray.rllib.agent import Agent
from ray.rllib.ddpg.ddpg_evaluator import DDPGEvaluator, RemoteDDPGEvaluator
from ray.rllib.optimizers import LocalSyncReplayOptimizer
from ray.tune.result import TrainingResult

DEFAULT_CONFIG = {
    # Actor learning rate
    "actor_lr": 0.0001,
    # Critic learning rate
    "critic_lr": 0.001,
    # Arguments to pass in to env creator
    "env_config": {},
    # MDP Discount factor
    "gamma": 0.99,
    # Number of steps after which the rollout gets cut
    "horizon": 500,

    # Whether to include parameter noise
    "noise_add": True,
    # Linear decay of exploration policy
    "noise_epsilon": 0.0002,
    # Parameters for noise process
    "noise_parameters": {
        "mu": 0,
        "sigma": 0.2,
        "theta": 0.15,
    },

    # Number of local steps taken for each call to sample
    "num_local_steps": 1,
    # Number of workers (excluding master)
    "num_workers": 0,

    "optimizer": {
        # Replay buffer size
        "buffer_size": 10000,
        # Number of steps in warm-up phase before learning starts
        "learning_starts": 500,
        # Whether to clip rewards
        "clip_rewards": False,
        # Whether to use prioritized replay
        "prioritized_replay": False,
        # Size of batch sampled from replay buffer
        "train_batch_size": 64,
    },

    # Controls how fast target networks move
    "tau": 0.001,
    # Number of steps taken per training iteration
    "train_steps": 600,
}


class DDPGAgent(Agent):
    _agent_name = "DDPG"
    _default_config = DEFAULT_CONFIG

    def _init(self):
        self.local_evaluator = DDPGEvaluator(
            self.registry, self.env_creator, self.config)
        self.remote_evaluators = [
            RemoteDDPGEvaluator.remote(
                self.registry, self.env_creator, self.config)
            for _ in range(self.config["num_workers"])]
        self.optimizer = LocalSyncReplayOptimizer(
            self.config["optimizer"], self.local_evaluator,
            self.remote_evaluators)

    def _train(self):
        for _ in range(self.config["train_steps"]):
            self.optimizer.step()
            # update target
            if self.optimizer.num_steps_trained > 0:
                self.local_evaluator.update_target()

        # generate training result
        return self._fetch_metrics()

    def _fetch_metrics(self):
        episode_rewards = []
        episode_lengths = []
        if self.config["num_workers"] > 0:
            metric_lists = [a.get_completed_rollout_metrics.remote()
                            for a in self.remote_evaluators]
            for metrics in metric_lists:
                for episode in ray.get(metrics):
                    episode_lengths.append(episode.episode_length)
                    episode_rewards.append(episode.episode_reward)
        else:
            metrics = self.local_evaluator.get_completed_rollout_metrics()
            for episode in metrics:
                episode_lengths.append(episode.episode_length)
                episode_rewards.append(episode.episode_reward)

        avg_reward = np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths)
        timesteps = np.sum(episode_lengths)

        result = TrainingResult(
            episode_reward_mean=avg_reward,
            episode_len_mean=avg_length,
            timesteps_this_iter=timesteps,
            info={})

        return result
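As a usage note, here is a minimal sketch of running the new agent directly, assuming the Agent base class accepts the config= and env= keyword arguments that other rllib agents of this version take; Pendulum-v0 matches the environment mentioned in the commit messages:

```python
import ray
from ray.rllib.ddpg import DDPGAgent, DEFAULT_CONFIG

ray.init()

# Assumption: the Agent constructor takes config= and env= like other rllib
# agents of this era. train_steps is lowered just to keep the example short.
config = dict(DEFAULT_CONFIG, train_steps=100)
agent = DDPGAgent(config=config, env="Pendulum-v0")

for i in range(3):
    result = agent.train()  # returns a TrainingResult namedtuple
    print(i, result.episode_reward_mean, result.timesteps_this_iter)
```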
75 changes: 75 additions & 0 deletions python/ray/rllib/ddpg/ddpg_evaluator.py
@@ -0,0 +1,75 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import ray
from ray.rllib.ddpg.models import DDPGModel
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.optimizers import PolicyEvaluator
from ray.rllib.utils.filter import NoFilter
from ray.rllib.utils.process_rollout import process_rollout
from ray.rllib.utils.sampler import SyncSampler


class DDPGEvaluator(PolicyEvaluator):

    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]))

        # contains model, target_model
        self.model = DDPGModel(registry, self.env, config)

        self.sampler = SyncSampler(
            self.env, self.model.model, NoFilter(),
            config["num_local_steps"], horizon=config["horizon"])

    def sample(self):
        """Returns a batch of samples."""

        rollout = self.sampler.get_data()
        rollout.data["weights"] = np.ones_like(rollout.data["rewards"])

        # since each sample is one step, no discounting needs to be applied;
        # this does not involve config["gamma"]
        samples = process_rollout(
            rollout, NoFilter(),
            gamma=1.0, use_gae=False)

        return samples

    def update_target(self):
        """Updates target critic and target actor."""
        self.model.update_target()

    def compute_gradients(self, samples):
        """Returns critic, actor gradients."""
        return self.model.compute_gradients(samples)

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.model.apply_gradients(grads)

    def compute_apply(self, samples):
        grads, _ = self.compute_gradients(samples)
        self.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        self.model.set_weights(weights)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()


RemoteDDPGEvaluator = ray.remote(DDPGEvaluator)
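update_target() defers to DDPGModel in models.py, which is not part of this diff. For reference, here is a self-contained NumPy sketch of the standard soft ("Polyak") target update that the tau setting controls; it illustrates the rule rather than reproducing the models.py implementation:

```python
import numpy as np

def soft_update(target_weights, online_weights, tau=0.001):
    """Move each target weight a fraction tau toward its online counterpart."""
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_weights, target_weights)]

# Toy weights: with tau=0.001 the target network tracks the online network slowly.
online = [np.ones((2, 2))]
target = [np.zeros((2, 2))]
target = soft_update(target, online, tau=0.001)
print(target[0][0, 0])  # 0.001
```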