Commit 3d106e5
Improve PPO algorithm (#312)
* [IBR-2068] Modify standard deviation of gaussian action in ppo
* [IBR-2068] Add ppo algorithm for discrete action
* [IBR-2068] Add shared backbone for actor critic
* [IBR-2068] Fix gpu oom bug
* [IBR-2068] Tuning hyper-parameters for ppo
* [IBR-2068] Modify multi env
* [IBR-2068] Modify learner for shared actor critic
* [IBR-2068] Rollback ppo config
* [IBR-2068] Add ppo with discrete action
* [IBR-2068] Remove retain_graph option
* docs: add isk03276 as a contributor for code (#314)
* docs: update README.md [skip ci]
* docs: update .all-contributorsrc [skip ci]

Co-authored-by: allcontributors[bot] <46447321+allcontributors[bot]@users.noreply.github.com>
1 parent b3df31e commit 3d106e5

File tree: 19 files changed, +280 −70 lines

.all-contributorsrc
Lines changed: 9 additions & 0 deletions

```diff
@@ -85,6 +85,15 @@
       "contributions": [
         "maintenance"
       ]
+    },
+    {
+      "login": "isk03276",
+      "name": "eunjin",
+      "avatar_url": "https://avatars.githubusercontent.com/u/23740495?v=4",
+      "profile": "https://github.com/isk03276",
+      "contributions": [
+        "code"
+      ]
     }
   ],
   "contributorsPerLine": 7,
```

README.md
Lines changed: 2 additions & 1 deletion

```diff
@@ -4,7 +4,7 @@
 [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/medipixel/rl_algorithms.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/medipixel/rl_algorithms/context:python)
 [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)<!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
-[![All Contributors](https://img.shields.io/badge/all_contributors-9-orange.svg?style=flat-square)](#contributors-)
+[![All Contributors](https://img.shields.io/badge/all_contributors-10-orange.svg?style=flat-square)](#contributors-)
 <!-- ALL-CONTRIBUTORS-BADGE:END -->

 </p>
@@ -47,6 +47,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
   <tr>
     <td align="center"><a href="https://jiseonghan.github.io/"><img src="https://avatars2.githubusercontent.com/u/48741026?v=4?s=100" width="100px;" alt=""/><br /><sub><b>Jiseong Han</b></sub></a><br /><a href="https://github.com/medipixel/rl_algorithms/commits?author=jiseongHAN" title="Code">💻</a></td>
     <td align="center"><a href="https://github.com/sehyun-hwang"><img src="https://avatars3.githubusercontent.com/u/23437715?v=4?s=100" width="100px;" alt=""/><br /><sub><b>Sehyun Hwang</b></sub></a><br /><a href="#maintenance-sehyun-hwang" title="Maintenance">🚧</a></td>
+    <td align="center"><a href="https://github.com/isk03276"><img src="https://avatars.githubusercontent.com/u/23740495?v=4?s=100" width="100px;" alt=""/><br /><sub><b>eunjin</b></sub></a><br /><a href="https://github.com/medipixel/rl_algorithms/commits?author=isk03276" title="Code">💻</a></td>
   </tr>
 </table>
```

configs/lunarlander_continuous_v2/a2c.yaml
Lines changed: 2 additions & 0 deletions

```diff
@@ -10,12 +10,14 @@ learner_cfg:
   backbone:
     actor:
     critic:
+    shared_actor_critic:
   head:
     actor:
       type: "GaussianDist"
       configs:
         hidden_sizes: [256, 256]
         output_activation: "identity"
+        fixed_logstd: True
     critic:
       type: "MLP"
       configs:
```

configs/lunarlander_continuous_v2/bc_ddpg.yaml
Lines changed: 1 addition & 0 deletions

```diff
@@ -24,6 +24,7 @@ learner_cfg:
   backbone:
     actor:
     critic:
+    shared_actor_critic:
   head:
     actor:
       type: "MLP"
```

configs/lunarlander_continuous_v2/bc_sac.yaml
Lines changed: 2 additions & 0 deletions

```diff
@@ -29,12 +29,14 @@ learner_cfg:
     actor:
     critic_vf:
     critic_qf:
+    shared_actor_critic:
   head:
     actor:
       type: "TanhGaussianDistParams"
       configs:
         hidden_sizes: [256, 256]
         output_activation: "identity"
+        fixed_logstd: False
     critic_vf:
       type: "MLP"
       configs:
```

configs/lunarlander_continuous_v2/ddpg.yaml
Lines changed: 1 addition & 0 deletions

```diff
@@ -14,6 +14,7 @@ learner_cfg:
   backbone:
     actor:
     critic:
+    shared_actor_critic:
   head:
     actor:
       type: "MLP"
```

configs/lunarlander_continuous_v2/ddpgfd.yaml
Lines changed: 1 addition & 0 deletions

```diff
@@ -25,6 +25,7 @@ learner_cfg:
   backbone:
     actor:
     critic:
+    shared_actor_critic:
   head:
     actor:
       type: "MLP"
```

configs/lunarlander_continuous_v2/ppo.yaml
Lines changed: 6 additions & 5 deletions

```diff
@@ -8,26 +8,27 @@ hyper_params:
   epsilon_decay_period: 1500
   w_value: 1.0
   w_entropy: 0.001
-  gradient_clip_ac: 1.0
-  gradient_clip_cr: 0.5
+  gradient_clip_ac: 0.5
+  gradient_clip_cr: 1.0
   epoch: 16
   rollout_len: 256
   n_workers: 12
-  use_clipped_value_loss: True
+  use_clipped_value_loss: False
   standardize_advantage: True
-  is_discrete: False

 learner_cfg:
   type: "PPOLearner"
   backbone:
     actor:
     critic:
+    shared_actor_critic:
   head:
     actor:
       type: "GaussianDist"
       configs:
         hidden_sizes: [256, 256]
-        output_activation: "tanh"
+        output_activation: "identity"
+        fixed_logstd: True
     critic:
       type: "MLP"
       configs:
```
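Two things worth noting here. First, `is_discrete` disappears from `hyper_params`; the agent now infers it from the environment's action space (see the `agent.py` change below). Second, `use_clipped_value_loss` flips from `True` to `False`. As a hedged sketch of what such a flag typically toggles between (illustrative names, not this repo's exact learner code), the clipped variant bounds how far the value prediction may move from the rollout-time prediction:

```python
import torch

# Illustrative sketch only: function and argument names are hypothetical.
def value_loss(values, old_values, returns, clip_eps=0.2, use_clipped_value_loss=False):
    if use_clipped_value_loss:
        # PPO-style: clip the value update around the old prediction and
        # take the pessimistic (larger) of the two squared errors.
        clipped = old_values + torch.clamp(values - old_values, -clip_eps, clip_eps)
        return torch.max((values - returns).pow(2), (clipped - returns).pow(2)).mean()
    # Plain MSE against the discounted returns (the setting chosen above).
    return (values - returns).pow(2).mean()
```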

configs/lunarlander_continuous_v2/sac.yaml
Lines changed: 2 additions & 0 deletions

```diff
@@ -19,12 +19,14 @@ learner_cfg:
     actor:
     critic_vf:
     critic_qf:
+    shared_actor_critic:
   head:
     actor:
       type: "TanhGaussianDistParams"
       configs:
         hidden_sizes: [256, 256]
         output_activation: "identity"
+        fixed_logstd: False
     critic_vf:
       type: "MLP"
       configs:
```

configs/lunarlander_continuous_v2/sacfd.yaml
Lines changed: 2 additions & 0 deletions

```diff
@@ -30,12 +30,14 @@ learner_cfg:
     actor:
     critic_vf:
     critic_qf:
+    shared_actor_critic:
   head:
     actor:
       type: "TanhGaussianDistParams"
       configs:
         hidden_sizes: [256, 256]
         output_activation: "identity"
+        fixed_logstd: False
     critic_vf:
       type: "MLP"
       configs:
```

configs/lunarlander_continuous_v2/td3.yaml
Lines changed: 1 addition & 0 deletions

```diff
@@ -12,6 +12,7 @@ learner_cfg:
   backbone:
     actor:
     critic:
+    shared_actor_critic:
   head:
     actor:
       type: "MLP"
```

configs/lunarlander_v2/ppo.yaml
Lines changed: 40 additions & 0 deletions (new file)

```yaml
type: "PPOAgent"
hyper_params:
  gamma: 0.99
  tau: 0.95
  batch_size: 32
  max_epsilon: 0.2
  min_epsilon: 0.2
  epsilon_decay_period: 1500
  w_value: 1.0
  w_entropy: 0.001
  gradient_clip_ac: 0.5
  gradient_clip_cr: 1.0
  epoch: 16
  rollout_len: 256
  n_workers: 12
  use_clipped_value_loss: False
  standardize_advantage: True

learner_cfg:
  type: "PPOLearner"
  backbone:
    actor:
    critic:
    shared_actor_critic:
  head:
    actor:
      type: "CategoricalDist"
      configs:
        hidden_sizes: [256, 256]
        output_activation: "identity"
    critic:
      type: "MLP"
      configs:
        hidden_sizes: [256, 256]
        output_size: 1
        output_activation: "identity"
  optim_cfg:
    lr_actor: 0.0003
    lr_critic: 0.001
    weight_decay: 0.0
```
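This new config is the discrete-action counterpart of the continuous one: a `CategoricalDist` actor head in place of `GaussianDist`. Since `max_epsilon` equals `min_epsilon`, the `epsilon_decay_period` leaves the clip range effectively constant at 0.2. For reference, a hedged sketch of the clipped surrogate objective that this epsilon parameterizes (illustrative names, not the repo's exact code):

```python
import torch

# Standard PPO clipped surrogate with epsilon = 0.2, matching the config above.
def ppo_actor_loss(log_prob, old_log_prob, advantage, epsilon=0.2):
    ratio = (log_prob - old_log_prob).exp()  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return -torch.min(surr1, surr2).mean()   # maximize, hence the minus sign
```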

configs/pong_no_frameskip_v4/ppo.yaml
Lines changed: 47 additions & 0 deletions (new file)

```yaml
type: "PPOAgent"
hyper_params:
  gamma: 0.99
  tau: 0.95
  batch_size: 32
  max_epsilon: 0.2
  min_epsilon: 0.2
  epsilon_decay_period: 1500
  w_value: 1.0
  w_entropy: 0.001
  gradient_clip_ac: 0.5
  gradient_clip_cr: 1.0
  epoch: 16
  rollout_len: 256
  n_workers: 4
  use_clipped_value_loss: False
  standardize_advantage: True

learner_cfg:
  type: "PPOLearner"
  backbone:
    actor:
    critic:
    shared_actor_critic:
      type: "CNN"
      configs:
        input_sizes: [4, 32, 64]
        output_sizes: [32, 64, 64]
        kernel_sizes: [8, 4, 3]
        strides: [4, 2, 1]
        paddings: [1, 0, 0]
  head:
    actor:
      type: "CategoricalDist"
      configs:
        hidden_sizes: [512]
        output_activation: "identity"
    critic:
      type: "MLP"
      configs:
        hidden_sizes: [512]
        output_size: 1
        output_activation: "identity"
  optim_cfg:
    lr_actor: 0.0003
    lr_critic: 0.001
    weight_decay: 0.0
```
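Here the `shared_actor_critic` slot is actually populated: one CNN backbone feeds both the actor and critic heads, which is what the "Add shared backbone for actor critic" commit item refers to. A quick sketch to sanity-check the flattened feature size this backbone produces; the 84x84 stacked-frame input is an assumption based on conventional Atari preprocessing, not something stated in this diff:

```python
import torch
import torch.nn as nn

# CNN described by the config above; 84x84 input frames are an assumption.
cnn = nn.Sequential(
    nn.Conv2d(4, 32, kernel_size=8, stride=4, padding=1), nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0), nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0), nn.ReLU(),
)
features = cnn(torch.zeros(1, 4, 84, 84)).flatten(1)
print(features.shape)  # torch.Size([1, 3136]) under that assumption -> fed to the 512-unit heads
```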

rl_algorithms/common/abstract/agent.py
Lines changed: 2 additions & 0 deletions

```diff
@@ -66,6 +66,8 @@ def __init__(
         self.total_step = 0
         self.learner = None

+        self.is_discrete = isinstance(self.env_info.action_space, gym.spaces.Discrete)
+
     @abstractmethod
     def select_action(self, state: np.ndarray) -> Union[torch.Tensor, np.ndarray]:
         pass
```
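This replaces the `is_discrete: False` config flag with a check against the environment itself, so discrete and continuous PPO share one config schema. A minimal standalone illustration of the same check using the standard gym API (environment names here are just the ones these configs target):

```python
import gym

# Discrete action space -> CategoricalDist head; Box -> GaussianDist head.
env = gym.make("LunarLander-v2")
print(isinstance(env.action_space, gym.spaces.Discrete))  # True

env = gym.make("LunarLanderContinuous-v2")
print(isinstance(env.action_space, gym.spaces.Discrete))  # False (Box space)
```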

rl_algorithms/common/networks/brain.py
Lines changed: 7 additions & 1 deletion

```diff
@@ -27,10 +27,16 @@ def __init__(
         self,
         backbone_cfg: ConfigDict,
         head_cfg: ConfigDict,
+        shared_backbone: nn.Module = None,
     ):
         """Initialize."""
         nn.Module.__init__(self)
-        if not backbone_cfg:
+        if shared_backbone is not None:
+            self.backbone = shared_backbone
+            head_cfg.configs.input_size = self.calculate_fc_input_size(
+                head_cfg.configs.state_size
+            )
+        elif not backbone_cfg:
             self.backbone = identity
             head_cfg.configs.input_size = head_cfg.configs.state_size[0]
         else:
```
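Passing `shared_backbone` lets two `Brain` instances hold the same feature extractor by reference, so actor and critic losses both backpropagate into one set of backbone parameters; that shared graph is presumably why the learner and the `retain_graph` handling were reworked in the same commit. A minimal sketch of the idea outside the repo's classes (all names here are illustrative):

```python
import torch.nn as nn

class Net(nn.Module):
    """Tiny stand-in for Brain: a backbone followed by a task head."""

    def __init__(self, backbone: nn.Module, head: nn.Module):
        super().__init__()
        self.backbone = backbone  # stored by reference, not copied
        self.head = head

    def forward(self, x):
        return self.head(self.backbone(x))

# One backbone object, two heads: gradients from either loss update it.
shared = nn.Sequential(nn.Linear(8, 256), nn.ReLU())
actor = Net(shared, nn.Linear(256, 4))
critic = Net(shared, nn.Linear(256, 1))
assert actor.backbone is critic.backbone  # identical parameters underneath
```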

rl_algorithms/common/networks/heads.py
Lines changed: 45 additions & 8 deletions

```diff
@@ -8,7 +8,7 @@
 from typing import Callable, Tuple

 import torch
-from torch.distributions import Normal
+from torch.distributions import Categorical, Normal
 import torch.nn as nn
 import torch.nn.functional as F

@@ -127,10 +127,15 @@ def __init__(
         self.log_std_min = log_std_min
         self.log_std_max = log_std_max
         in_size = configs.hidden_sizes[-1]
+        self.fixed_logstd = configs.fixed_logstd

-        # set log_std layer
-        self.log_std_layer = nn.Linear(in_size, configs.output_size)
-        self.log_std_layer = init_fn(self.log_std_layer)
+        # set log_std
+        if self.fixed_logstd:
+            log_std = -0.5 * torch.ones(self.output_size, dtype=torch.float32)
+            self.log_std = torch.nn.Parameter(log_std)
+        else:
+            self.log_std_layer = nn.Linear(in_size, configs.output_size)
+            self.log_std_layer = init_fn(self.log_std_layer)

         # set mean layer
         self.mu_layer = nn.Linear(in_size, configs.output_size)
@@ -144,10 +149,13 @@ def get_dist_params(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
         mu = self.mu_activation(self.mu_layer(hidden))

         # get std
-        log_std = torch.tanh(self.log_std_layer(hidden))
-        log_std = self.log_std_min + 0.5 * (self.log_std_max - self.log_std_min) * (
-            log_std + 1
-        )
+        if self.fixed_logstd:
+            log_std = self.log_std
+        else:
+            log_std = torch.tanh(self.log_std_layer(hidden))
+            log_std = self.log_std_min + 0.5 * (self.log_std_max - self.log_std_min) * (
+                log_std + 1
+            )
         std = torch.exp(log_std)

         return mu, log_std, std
```
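With `fixed_logstd: True`, the log standard deviation becomes a single learnable, state-independent parameter vector (initialized at -0.5) instead of a tanh-squashed linear output of the hidden features. A small sketch of what the fixed branch produces downstream, using only `torch.distributions` (shapes and values here are illustrative):

```python
import torch
from torch.distributions import Normal

output_size = 2  # e.g., action dimension
log_std = torch.nn.Parameter(-0.5 * torch.ones(output_size))  # as in the diff above

mu = torch.zeros(3, output_size)       # batch of 3 action means from mu_layer
dist = Normal(mu, log_std.exp())       # state-independent std broadcasts over the batch
action = dist.sample()
log_prob = dist.log_prob(action).sum(-1, keepdim=True)  # per-sample log-probability
```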
```diff
@@ -190,3 +198,32 @@ def forward(
         log_prob = log_prob.sum(-1, keepdim=True)

         return action, log_prob, z, mu, std
+
+
+# TODO: Remove it when upgrade torch>=1.7
+# pylint: disable=abstract-method
+@HEADS.register_module
+class CategoricalDist(MLP):
+    """Multilayer perceptron with Categorical distribution output."""
+
+    def __init__(
+        self,
+        configs: ConfigDict,
+        hidden_activation: Callable = F.relu,
+    ):
+        """Initialize."""
+        super().__init__(
+            configs=configs,
+            hidden_activation=hidden_activation,
+            use_output_layer=True,
+        )
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
+        """Forward method implementation."""
+        ac_logits = super().forward(x)
+
+        # get categorical distribution and action
+        dist = Categorical(logits=ac_logits)
+        action = dist.sample()
+
+        return action, dist
```
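The new `CategoricalDist` head returns both a sampled action and the distribution object, so the caller can compute the log-probabilities and entropy that PPO's loss needs. The `torch.distributions.Categorical` part works like this (shapes are illustrative):

```python
import torch
from torch.distributions import Categorical

logits = torch.randn(4, 6)        # e.g., batch of 4 states, 6 discrete actions
dist = Categorical(logits=logits)
action = dist.sample()            # shape (4,): one action index per state
log_prob = dist.log_prob(action)  # feeds the PPO probability ratio
entropy = dist.entropy()          # weighted by w_entropy in the loss
```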
