Improve PPO algorithm #312

Merged · 12 commits · Jul 9, 2021

Changes from all commits:

9 changes: 9 additions & 0 deletions .all-contributorsrc
@@ -85,6 +85,15 @@
"contributions": [
"maintenance"
]
},
{
"login": "isk03276",
"name": "eunjin",
"avatar_url": "https://avatars.githubusercontent.com/u/23740495?v=4",
"profile": "https://github.com/isk03276",
"contributions": [
"code"
]
}
],
"contributorsPerLine": 7,
3 changes: 2 additions & 1 deletion README.md
@@ -4,7 +4,7 @@
[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/medipixel/rl_algorithms.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/medipixel/rl_algorithms/context:python)
[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)<!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
-[![All Contributors](https://img.shields.io/badge/all_contributors-9-orange.svg?style=flat-square)](#contributors-)
+[![All Contributors](https://img.shields.io/badge/all_contributors-10-orange.svg?style=flat-square)](#contributors-)
<!-- ALL-CONTRIBUTORS-BADGE:END -->

</p>
@@ -47,6 +47,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
<tr>
<td align="center"><a href="https://jiseonghan.github.io/"><img src="https://avatars2.githubusercontent.com/u/48741026?v=4?s=100" width="100px;" alt=""/><br /><sub><b>Jiseong Han</b></sub></a><br /><a href="https://github.com/medipixel/rl_algorithms/commits?author=jiseongHAN" title="Code">💻</a></td>
<td align="center"><a href="https://github.com/sehyun-hwang"><img src="https://avatars3.githubusercontent.com/u/23437715?v=4?s=100" width="100px;" alt=""/><br /><sub><b>Sehyun Hwang</b></sub></a><br /><a href="#maintenance-sehyun-hwang" title="Maintenance">🚧</a></td>
<td align="center"><a href="https://github.com/isk03276"><img src="https://avatars.githubusercontent.com/u/23740495?v=4?s=100" width="100px;" alt=""/><br /><sub><b>eunjin</b></sub></a><br /><a href="https://github.com/medipixel/rl_algorithms/commits?author=isk03276" title="Code">💻</a></td>
</tr>
</table>

2 changes: 2 additions & 0 deletions configs/lunarlander_continuous_v2/a2c.yaml
@@ -10,12 +10,14 @@ learner_cfg:
backbone:
actor:
critic:
shared_actor_critic:
head:
actor:
type: "GaussianDist"
configs:
hidden_sizes: [256, 256]
output_activation: "identity"
fixed_logstd: True
critic:
type: "MLP"
configs:
1 change: 1 addition & 0 deletions configs/lunarlander_continuous_v2/bc_ddpg.yaml
@@ -24,6 +24,7 @@ learner_cfg:
backbone:
actor:
critic:
shared_actor_critic:
head:
actor:
type: "MLP"
2 changes: 2 additions & 0 deletions configs/lunarlander_continuous_v2/bc_sac.yaml
@@ -29,12 +29,14 @@ learner_cfg:
actor:
critic_vf:
critic_qf:
shared_actor_critic:
head:
actor:
type: "TanhGaussianDistParams"
configs:
hidden_sizes: [256, 256]
output_activation: "identity"
fixed_logstd: False
critic_vf:
type: "MLP"
configs:
1 change: 1 addition & 0 deletions configs/lunarlander_continuous_v2/ddpg.yaml
@@ -14,6 +14,7 @@ learner_cfg:
backbone:
actor:
critic:
shared_actor_critic:
head:
actor:
type: "MLP"
1 change: 1 addition & 0 deletions configs/lunarlander_continuous_v2/ddpgfd.yaml
@@ -25,6 +25,7 @@ learner_cfg:
backbone:
actor:
critic:
shared_actor_critic:
head:
actor:
type: "MLP"
11 changes: 6 additions & 5 deletions configs/lunarlander_continuous_v2/ppo.yaml
@@ -8,26 +8,27 @@ hyper_params:
epsilon_decay_period: 1500
w_value: 1.0
w_entropy: 0.001
-gradient_clip_ac: 1.0
-gradient_clip_cr: 0.5
+gradient_clip_ac: 0.5
+gradient_clip_cr: 1.0
epoch: 16
rollout_len: 256
n_workers: 12
-use_clipped_value_loss: True
+use_clipped_value_loss: False
standardize_advantage: True
is_discrete: False

learner_cfg:
type: "PPOLearner"
backbone:
actor:
critic:
shared_actor_critic:
head:
actor:
type: "GaussianDist"
configs:
hidden_sizes: [256, 256]
output_activation: "tanh"
output_activation: "identity"
fixed_logstd: True
critic:
type: "MLP"
configs:
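For orientation, the hyperparameters above plug into the usual PPO objective: `max_epsilon`/`min_epsilon` bound the clipping range, `w_value` and `w_entropy` weight the value and entropy terms, and the two `gradient_clip_*` values are applied to the actor and the critic separately. Below is a minimal sketch of one such update step in PyTorch; it assumes the actor head returns `(action, dist)` like the new `CategoricalDist` head does, and the batch tensor names (`old_log_probs`, `returns`, `advs`, `old_values`) are illustrative rather than the repository's exact implementation.

```python
import torch
import torch.nn as nn

def ppo_update(actor, critic, actor_optim, critic_optim, batch,
               epsilon=0.2, w_value=1.0, w_entropy=0.001,
               gradient_clip_ac=0.5, gradient_clip_cr=1.0,
               use_clipped_value_loss=False):
    """One PPO update step (illustrative sketch, not the repository's code)."""
    states, actions, old_log_probs, returns, advs, old_values = batch

    # clipped surrogate policy loss
    _, dist = actor(states)
    log_probs = dist.log_prob(actions)  # for a Gaussian policy, sum over action dims
    ratio = (log_probs - old_log_probs).exp()
    surr = ratio * advs
    clipped_surr = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advs
    actor_loss = -torch.min(surr, clipped_surr).mean() - w_entropy * dist.entropy().mean()

    # value loss, optionally clipped around the old value estimates
    values = critic(states)
    if use_clipped_value_loss:
        values_clipped = old_values + (values - old_values).clamp(-epsilon, epsilon)
        value_loss = torch.max((values - returns).pow(2),
                               (values_clipped - returns).pow(2)).mean()
    else:
        value_loss = (values - returns).pow(2).mean()
    critic_loss = w_value * value_loss

    # separate optimizers and separate gradient-clip norms, as in the config
    actor_optim.zero_grad()
    actor_loss.backward()
    nn.utils.clip_grad_norm_(actor.parameters(), gradient_clip_ac)
    actor_optim.step()

    critic_optim.zero_grad()
    critic_loss.backward()
    nn.utils.clip_grad_norm_(critic.parameters(), gradient_clip_cr)
    critic_optim.step()
```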
2 changes: 2 additions & 0 deletions configs/lunarlander_continuous_v2/sac.yaml
@@ -19,12 +19,14 @@ learner_cfg:
actor:
critic_vf:
critic_qf:
shared_actor_critic:
head:
actor:
type: "TanhGaussianDistParams"
configs:
hidden_sizes: [256, 256]
output_activation: "identity"
fixed_logstd: False
critic_vf:
type: "MLP"
configs:
2 changes: 2 additions & 0 deletions configs/lunarlander_continuous_v2/sacfd.yaml
@@ -30,12 +30,14 @@ learner_cfg:
actor:
critic_vf:
critic_qf:
shared_actor_critic:
head:
actor:
type: "TanhGaussianDistParams"
configs:
hidden_sizes: [256, 256]
output_activation: "identity"
fixed_logstd: False
critic_vf:
type: "MLP"
configs:
1 change: 1 addition & 0 deletions configs/lunarlander_continuous_v2/td3.yaml
@@ -12,6 +12,7 @@ learner_cfg:
backbone:
actor:
critic:
shared_actor_critic:
head:
actor:
type: "MLP"
40 changes: 40 additions & 0 deletions configs/lunarlander_v2/ppo.yaml
@@ -0,0 +1,40 @@
type: "PPOAgent"
hyper_params:
gamma: 0.99
tau: 0.95
batch_size: 32
max_epsilon: 0.2
min_epsilon: 0.2
epsilon_decay_period: 1500
w_value: 1.0
w_entropy: 0.001
gradient_clip_ac: 0.5
gradient_clip_cr: 1.0
epoch: 16
rollout_len: 256
n_workers: 12
use_clipped_value_loss: False
standardize_advantage: True

learner_cfg:
type: "PPOLearner"
backbone:
actor:
critic:
shared_actor_critic:
head:
actor:
type: "CategoricalDist"
configs:
hidden_sizes: [256, 256]
output_activation: "identity"
critic:
type: "MLP"
configs:
hidden_sizes: [256, 256]
output_size: 1
output_activation: "identity"
optim_cfg:
lr_actor: 0.0003
lr_critic: 0.001
weight_decay: 0.0
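The `max_epsilon`, `min_epsilon`, and `epsilon_decay_period` keys suggest a linearly annealed clipping range (with both endpoints set to 0.2 here, the range stays constant). A minimal sketch of such a schedule, assuming an update counter `i_update` (a hypothetical name, not necessarily what the agent uses internally):

```python
def clip_epsilon(i_update: int,
                 max_epsilon: float = 0.2,
                 min_epsilon: float = 0.2,
                 epsilon_decay_period: int = 1500) -> float:
    """Linearly anneal the PPO clip range from max_epsilon down to min_epsilon."""
    frac = min(i_update / epsilon_decay_period, 1.0)
    return max_epsilon - frac * (max_epsilon - min_epsilon)

print(clip_epsilon(0), clip_epsilon(3000))  # 0.2 0.2 with the values above
```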
47 changes: 47 additions & 0 deletions configs/pong_no_frameskip_v4/ppo.yaml
@@ -0,0 +1,47 @@
type: "PPOAgent"
hyper_params:
gamma: 0.99
tau: 0.95
batch_size: 32
max_epsilon: 0.2
min_epsilon: 0.2
epsilon_decay_period: 1500
w_value: 1.0
w_entropy: 0.001
gradient_clip_ac: 0.5
gradient_clip_cr: 1.0
epoch: 16
rollout_len: 256
n_workers: 4
use_clipped_value_loss: False
standardize_advantage: True

learner_cfg:
type: "PPOLearner"
backbone:
actor:
critic:
shared_actor_critic:
type: "CNN"
configs:
input_sizes: [4, 32, 64]
output_sizes: [32, 64, 64]
kernel_sizes: [8, 4, 3]
strides: [4, 2, 1]
paddings: [1, 0, 0]
head:
actor:
type: "CategoricalDist"
configs:
hidden_sizes: [512]
output_activation: "identity"
critic:
type: "MLP"
configs:
hidden_sizes: [512]
output_size: 1
output_activation: "identity"
optim_cfg:
lr_actor: 0.0003
lr_critic: 0.001
weight_decay: 0.0
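The new `shared_actor_critic` CNN backbone feeds both heads, so its flattened output size determines the heads' input size. A quick sanity check of that size, assuming 84×84 preprocessed Pong frames (the frame size itself is not part of this config, so that is an assumption):

```python
def conv2d_out(size: int, kernel: int, stride: int, padding: int) -> int:
    """Spatial output size of a single Conv2d layer."""
    return (size + 2 * padding - kernel) // stride + 1

size = 84  # assumed preprocessed frame size
for kernel, stride, padding in zip([8, 4, 3], [4, 2, 1], [1, 0, 0]):
    size = conv2d_out(size, kernel, stride, padding)  # 84 -> 20 -> 9 -> 7

print(64 * size * size)  # 3136 flattened features feeding both heads
```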
2 changes: 2 additions & 0 deletions rl_algorithms/common/abstract/agent.py
@@ -66,6 +66,8 @@ def __init__(
self.total_step = 0
self.learner = None

self.is_discrete = isinstance(self.env_info.action_space, gym.spaces.Discrete)

@abstractmethod
def select_action(self, state: np.ndarray) -> Union[torch.Tensor, np.ndarray]:
pass
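The `is_discrete` attribute added above infers the action-space type from the environment instead of trusting a config flag alone. A small illustrative sketch of how such a flag might select the actor head; the mapping shown is an assumption for illustration, not the repository's exact dispatch logic:

```python
import gym

def pick_actor_head(action_space: gym.Space) -> str:
    """Choose a distribution head from the action-space type (illustrative only)."""
    if isinstance(action_space, gym.spaces.Discrete):
        return "CategoricalDist"  # discrete actions, e.g. LunarLander-v2 or Pong
    return "GaussianDist"         # continuous actions, e.g. LunarLanderContinuous-v2

print(pick_actor_head(gym.spaces.Discrete(4)))                          # CategoricalDist
print(pick_actor_head(gym.spaces.Box(low=-1.0, high=1.0, shape=(2,))))  # GaussianDist
```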
8 changes: 7 additions & 1 deletion rl_algorithms/common/networks/brain.py
@@ -27,10 +27,16 @@ def __init__(
self,
backbone_cfg: ConfigDict,
head_cfg: ConfigDict,
+shared_backbone: nn.Module = None,
):
"""Initialize."""
nn.Module.__init__(self)
-if not backbone_cfg:
+if shared_backbone is not None:
+    self.backbone = shared_backbone
+    head_cfg.configs.input_size = self.calculate_fc_input_size(
+        head_cfg.configs.state_size
+    )
+elif not backbone_cfg:
self.backbone = identity
head_cfg.configs.input_size = head_cfg.configs.state_size[0]
else:
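With this change a `Brain` can be handed an already-constructed backbone instead of building one from `backbone_cfg`, which is what lets the actor and critic share one feature extractor. A toy PyTorch stand-in for that sharing (shapes and layer sizes are placeholders, not taken from any config here):

```python
import torch
import torch.nn as nn

# One backbone instance is reused by both heads, mirroring the effect of
# passing the same module as `shared_backbone` to two Brain objects.
backbone = nn.Sequential(nn.Linear(8, 256), nn.ReLU())
actor_head = nn.Linear(256, 4)   # e.g. categorical action logits
critic_head = nn.Linear(256, 1)  # state value

state = torch.randn(32, 8)
features = backbone(state)            # features computed by the shared weights
action_logits = actor_head(features)
state_values = critic_head(features)

# Both the policy loss and the value loss backpropagate into the same
# backbone parameters, so the feature extractor is trained jointly.
```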
53 changes: 45 additions & 8 deletions rl_algorithms/common/networks/heads.py
@@ -8,7 +8,7 @@
from typing import Callable, Tuple

import torch
-from torch.distributions import Normal
+from torch.distributions import Categorical, Normal
import torch.nn as nn
import torch.nn.functional as F

@@ -127,10 +127,15 @@ def __init__(
self.log_std_min = log_std_min
self.log_std_max = log_std_max
in_size = configs.hidden_sizes[-1]
+self.fixed_logstd = configs.fixed_logstd

-# set log_std layer
-self.log_std_layer = nn.Linear(in_size, configs.output_size)
-self.log_std_layer = init_fn(self.log_std_layer)
+# set log_std
+if self.fixed_logstd:
+    log_std = -0.5 * torch.ones(self.output_size, dtype=torch.float32)
+    self.log_std = torch.nn.Parameter(log_std)
+else:
+    self.log_std_layer = nn.Linear(in_size, configs.output_size)
+    self.log_std_layer = init_fn(self.log_std_layer)

# set mean layer
self.mu_layer = nn.Linear(in_size, configs.output_size)
@@ -144,10 +149,13 @@ def get_dist_params(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
mu = self.mu_activation(self.mu_layer(hidden))

# get std
-log_std = torch.tanh(self.log_std_layer(hidden))
-log_std = self.log_std_min + 0.5 * (self.log_std_max - self.log_std_min) * (
-    log_std + 1
-)
+if self.fixed_logstd:
+    log_std = self.log_std
+else:
+    log_std = torch.tanh(self.log_std_layer(hidden))
+    log_std = self.log_std_min + 0.5 * (self.log_std_max - self.log_std_min) * (
+        log_std + 1
+    )
std = torch.exp(log_std)

return mu, log_std, std
@@ -190,3 +198,32 @@ def forward(
log_prob = log_prob.sum(-1, keepdim=True)

return action, log_prob, z, mu, std


# TODO: Remove it when upgrade torch>=1.7
# pylint: disable=abstract-method
@HEADS.register_module
class CategoricalDist(MLP):
    """Multilayer perceptron with Categorical distribution output."""

    def __init__(
        self,
        configs: ConfigDict,
        hidden_activation: Callable = F.relu,
    ):
        """Initialize."""
        super().__init__(
            configs=configs,
            hidden_activation=hidden_activation,
            use_output_layer=True,
        )

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Forward method implementation."""
        ac_logits = super().forward(x)

        # get categorical distribution and action
        dist = Categorical(logits=ac_logits)
        action = dist.sample()

        return action, dist
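For reference, the new head returns both the sampled action and the full `Categorical` distribution, which is exactly what a PPO update needs for log-probabilities and entropy. A small usage sketch with placeholder logits (the shapes below are illustrative, not taken from the repository's tests):

```python
import torch
from torch.distributions import Categorical

# What CategoricalDist.forward produces, sketched with raw tensors:
# MLP logits -> Categorical distribution -> sampled action.
ac_logits = torch.randn(32, 4)    # batch of 32 states, 4 discrete actions (placeholder)
dist = Categorical(logits=ac_logits)
action = dist.sample()            # shape: (32,)

# Quantities the PPO loss reads off the returned distribution:
log_prob = dist.log_prob(action)  # shape: (32,)
entropy = dist.entropy()          # shape: (32,)
print(action.shape, log_prob.shape, entropy.shape)
```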