diff --git a/README.md b/README.md index b5c92897..900033a2 100644 --- a/README.md +++ b/README.md @@ -30,23 +30,7 @@ | [第十三章 AlphaStar 论文解读](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | | ## 算法实战 -| 算法名称 | 相关论文材料 | 环境 | 备注 | -| :--------------------------------------: | :---------------------------------------------------------: | ------------------------------------- | :--------------------------------: | -| [On-Policy First-Visit MC](./codes/MonteCarlo) | | [Racetrack](./codes/envs/racetrack_env.md) | | -| [Q-Learning](./codes/QLearning) | | [CliffWalking-v0](./codes/envs/gym_info.md) | | -| [Sarsa](./codes/Sarsa) | | [Racetrack](./codes/envs/racetrack_env.md) | | -| [DQN](./codes/DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | | -| DQN-cnn | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | 与DQN相比使用了CNN而不是全链接网络 | -| [DoubleDQN](./codes/DoubleDQN) | | [CartPole-v0](./codes/envs/gym_info.md) | 效果不好,待改进 | -| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | | -| [PolicyGradient](./codes/PolicyGradient) | | [CartPole-v0](./codes/envs/gym_info.md) | | -| A2C | | [CartPole-v0](./codes/envs/gym_info.md) | | -| A3C | | | | -| SAC | | | | -| [PPO](./codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./codes/envs/gym_info.md) | | -| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./codes/envs/gym_info.md) | | -| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | | -| GAIL | | | | +[点击](./codes)或者跳转```codes```文件夹下进入算法实战 ## 贡献者 diff --git a/codes/A2C/agent.py b/codes/A2C/agent.py index af1201bb..aafe7c1c 100644 --- a/codes/A2C/agent.py +++ b/codes/A2C/agent.py @@ -13,9 +13,9 @@ import torch.optim as optim class A2C: - def __init__(self,n_states, n_actions, cfg): + def __init__(self,state_dim, action_dim, cfg): self.gamma = 0.99 - self.model = ActorCritic(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(cfg.device) + self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device) self.optimizer = optim.Adam(self.model.parameters(),lr=cfg.lr) def choose_action(self, state): dist, value = self.model(state) diff --git a/codes/A2C/main.py b/codes/A2C/main.py index 08a1e1df..2bff5fc9 100644 --- a/codes/A2C/main.py +++ b/codes/A2C/main.py @@ -95,8 +95,8 @@ def train(cfg,env,agent): cfg = A2CConfig() env = gym.make('CartPole-v0') env.seed(1) # set random seed for env - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = A2C(n_states, n_actions, cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = A2C(state_dim, action_dim, cfg) train(cfg,env,agent) diff --git a/codes/A2C/model.py b/codes/A2C/model.py index 0ceba5e6..46b59dec 100644 --- a/codes/A2C/model.py +++ b/codes/A2C/model.py @@ -13,18 +13,18 @@ from torch.distributions import Categorical class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim=256): + def __init__(self, state_dim, action_dim, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 1) ) self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=1), ) diff --git 
a/codes/DDPG/agent.py b/codes/DDPG/agent.py index 29f34d66..f2860b70 100644 --- a/codes/DDPG/agent.py +++ b/codes/DDPG/agent.py @@ -19,12 +19,12 @@ class DDPG: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): self.device = cfg.device - self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_(param.data) diff --git a/codes/DDPG/env.py b/codes/DDPG/env.py index ad7bd0e1..85ca81c5 100644 --- a/codes/DDPG/env.py +++ b/codes/DDPG/env.py @@ -41,17 +41,17 @@ def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0. self.max_sigma = max_sigma self.min_sigma = min_sigma self.decay_period = decay_period - self.n_actions = action_space.shape[0] + self.action_dim = action_space.shape[0] self.low = action_space.low self.high = action_space.high self.reset() def reset(self): - self.obs = np.ones(self.n_actions) * self.mu + self.obs = np.ones(self.action_dim) * self.mu def evolve_obs(self): x = self.obs - dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions) + dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) self.obs = x + dx return self.obs diff --git a/codes/DDPG/main.py b/codes/DDPG/main.py index 5308ec60..bee9d214 100644 --- a/codes/DDPG/main.py +++ b/codes/DDPG/main.py @@ -82,9 +82,9 @@ def train(cfg,env,agent): cfg = DDPGConfig() env = NormalizedActions(gym.make("Pendulum-v0")) env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - agent = DDPG(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + agent = DDPG(state_dim,action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/DQN/README.md b/codes/DQN/README.md index ca3d6a9f..9eb22469 100644 --- a/codes/DQN/README.md +++ b/codes/DQN/README.md @@ -46,15 +46,15 @@ import torch.nn as nn import torch.nn.functional as F class FCN(nn.Module): - def __init__(self, n_states=4, n_actions=18): + def __init__(self, state_dim=4, action_dim=18): """ 初始化q网络,为全连接网络 - n_states: 输入的feature即环境的state数目 - n_actions: 输出的action总个数 + state_dim: 输入的feature即环境的state数目 + action_dim: 输出的action总个数 """ super(FCN, self).__init__() - self.fc1 = nn.Linear(n_states, 128) # 输入层 + self.fc1 = nn.Linear(state_dim, 128) # 输入层 self.fc2 = nn.Linear(128, 128) # 隐藏层 - self.fc3 = nn.Linear(128, n_actions) # 输出层 + self.fc3 = nn.Linear(128, action_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -66,8 +66,8 @@ class FCN(nn.Module): 在```agent.py```中我们定义强化学习算法,包括```choose_action```和```update```两个主要函数,初始化中: ```python -self.policy_net = FCN(n_states, n_actions).to(self.device) -self.target_net = FCN(n_states, n_actions).to(self.device) +self.policy_net = FCN(state_dim, 
action_dim).to(self.device) +self.target_net = FCN(state_dim, action_dim).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout diff --git a/codes/DQN/agent.py b/codes/DQN/agent.py index 299a4c9f..2b561751 100644 --- a/codes/DQN/agent.py +++ b/codes/DQN/agent.py @@ -20,11 +20,11 @@ import math import numpy as np from common.memory import ReplayBuffer -from common.model import MLP2 +from common.model import MLP class DQN: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): - self.n_actions = n_actions # 总的动作个数 + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # 奖励的折扣因子 # e-greedy策略相关参数 @@ -34,8 +34,8 @@ def __init__(self, n_states, n_actions, cfg): self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay self.batch_size = cfg.batch_size - self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout @@ -64,7 +64,7 @@ def choose_action(self, state, train=True): # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action else: with torch.no_grad(): # 取消保存梯度 diff --git a/codes/DQN/main.py b/codes/DQN/main.py index dae9c86e..a6a998e2 100644 --- a/codes/DQN/main.py +++ b/codes/DQN/main.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-03-17 20:35:37 +LastEditTime: 2021-03-26 17:17:17 @Discription: @Environment: python 3.7.7 ''' @@ -40,7 +40,7 @@ def __init__(self): self.lr = 0.01 # 学习率 self.memory_capacity = 800 # Replay Memory容量 self.batch_size = 64 - self.train_eps = 250 # 训练的episode数目 + self.train_eps = 300 # 训练的episode数目 self.train_steps = 200 # 训练每个episode的最大长度 self.target_update = 2 # target net的更新频率 self.eval_eps = 20 # 测试的episode数目 @@ -84,9 +84,9 @@ def train(cfg,env,agent): cfg = DQNConfig() env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym,此处一般不需要 env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = DQN(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = DQN(state_dim,action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/DQN/results/20210326-171704/ma_rewards_train.npy b/codes/DQN/results/20210326-171704/ma_rewards_train.npy new file mode 100644 index 00000000..2f231bb4 Binary files /dev/null and b/codes/DQN/results/20210326-171704/ma_rewards_train.npy differ diff --git a/codes/DQN/results/20210326-171704/rewards_curve_train.png b/codes/DQN/results/20210326-171704/rewards_curve_train.png new file mode 100644 index 00000000..0f289b23 Binary files /dev/null and b/codes/DQN/results/20210326-171704/rewards_curve_train.png differ diff --git a/codes/DQN/results/20210326-171704/rewards_train.npy 
b/codes/DQN/results/20210326-171704/rewards_train.npy new file mode 100644 index 00000000..9933915c Binary files /dev/null and b/codes/DQN/results/20210326-171704/rewards_train.npy differ diff --git a/codes/DQN/results/20210326-171722/ma_rewards_train.npy b/codes/DQN/results/20210326-171722/ma_rewards_train.npy new file mode 100644 index 00000000..1d9ea32f Binary files /dev/null and b/codes/DQN/results/20210326-171722/ma_rewards_train.npy differ diff --git a/codes/DQN/results/20210326-171722/rewards_curve_train.png b/codes/DQN/results/20210326-171722/rewards_curve_train.png new file mode 100644 index 00000000..e900e9c7 Binary files /dev/null and b/codes/DQN/results/20210326-171722/rewards_curve_train.png differ diff --git a/codes/DQN/results/20210326-171722/rewards_train.npy b/codes/DQN/results/20210326-171722/rewards_train.npy new file mode 100644 index 00000000..0351d733 Binary files /dev/null and b/codes/DQN/results/20210326-171722/rewards_train.npy differ diff --git a/codes/DQN/saved_model/20210326-171704/dqn_checkpoint.pth b/codes/DQN/saved_model/20210326-171704/dqn_checkpoint.pth new file mode 100644 index 00000000..567518ae Binary files /dev/null and b/codes/DQN/saved_model/20210326-171704/dqn_checkpoint.pth differ diff --git a/codes/DQN/saved_model/20210326-171722/dqn_checkpoint.pth b/codes/DQN/saved_model/20210326-171722/dqn_checkpoint.pth new file mode 100644 index 00000000..b460976a Binary files /dev/null and b/codes/DQN/saved_model/20210326-171722/dqn_checkpoint.pth differ diff --git a/codes/DQN_cnn/README.md b/codes/DQN_cnn/README.md new file mode 100644 index 00000000..4d1be2a8 --- /dev/null +++ b/codes/DQN_cnn/README.md @@ -0,0 +1,2 @@ +# DQN with cnn +原理与[DQN](../DQN)相同,只是将神经网络换成卷积神经网络,用于二维观测信息(state或obervation) \ No newline at end of file diff --git a/codes/DoubleDQN/README.md b/codes/DoubleDQN/README.md new file mode 100644 index 00000000..714bd26e --- /dev/null +++ b/codes/DoubleDQN/README.md @@ -0,0 +1,39 @@ +食用本篇之前,需要有DQN算法的基础,参考[DQN算法实战](../DQN)。 + +## 原理简介 + +Double-DQN是2016年提出的算法,灵感源自2010年的Double-Qlearning,可参考论文[Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)。 +跟Nature DQN一样,Double-DQN也用了两个网络,一个当前网络(对应用$Q$表示),一个目标网络(对应一般用$Q'$表示,为方便区分,以下用$Q_{tar}$代替)。我们先回忆一下,对于非终止状态,目标$Q_{tar}$值计算如下 +![在这里插入图片描述](assets/20201222145725907.png) + +而在Double-DQN中,不再是直接从目标$Q_{tar}$网络中选择各个动作中的最大$Q_{tar}$值,而是先从当前$Q$网络选择$Q$值最大对应的动作,然后代入到目标网络中计算对应的值: +![在这里插入图片描述](assets/20201222150225327.png) +Double-DQN的好处是Nature DQN中使用max虽然可以快速让Q值向可能的优化目标靠拢,但是很容易过犹不及,导致过度估计(Over Estimation),所谓过度估计就是最终我们得到的算法模型有很大的偏差(bias)。为了解决这个问题, DDQN通过解耦目标Q值动作的选择和目标Q值的计算这两步,来达到消除过度估计的问题,感兴趣可以阅读原论文。 + +伪代码如下: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png) +当然也可以两个网络可以同时为当前网络和目标网络,如下: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png) +或者这样更好理解如何同时为当前网络和目标网络: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png) + +## 代码实战 +完整程序见[github](https://github.com/JohnJim0816/reinforcement-learning-tutorials/tree/master/DoubleDQN)。结合上面的原理,其实Double DQN改进来很简单,基本只需要在```update```中修改几行代码,如下: +```python +'''以下是Nature DQN的q_target计算方式 +next_q_state_value = self.target_net( +next_state_batch).max(1)[0].detach() # # 计算所有next states的Q'(s_{t+1})的最大值,Q'为目标网络的q函数,比如tensor([ 0.0060, -0.0171,...,]) 
+#计算 q_target +#对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward +q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0]) +''' +'''以下是Double DQNq_target计算方式,与NatureDQN稍有不同''' +next_target_values = self.target_net( +next_state_batch) +#选出Q(s_t‘, a)对应的action,代入到next_target_values获得target net对应的next_q_value,即Q’(s_t|a=argmax Q(s_t‘, a)) +next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) +q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0]) +``` +reward变化结果如下: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png) +其中下边蓝色和红色分别表示Double DQN和Nature DQN在训练中的reward变化图,而上面蓝色和绿色则表示Double DQN和Nature DQN在测试中的reward变化图。 \ No newline at end of file diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/agent.py index 1f9c7c16..34774c4d 100644 --- a/codes/DoubleDQN/agent.py +++ b/codes/DoubleDQN/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2021-03-13 15:01:27 +LastEditTime: 2021-03-28 11:07:35 @Discription: @Environment: python 3.7.7 ''' @@ -16,16 +16,15 @@ import torch import torch.nn as nn import torch.optim as optim -import torch.nn.functional as F import random import math import numpy as np from common.memory import ReplayBuffer -from common.model import MLP2 +from common.model import MLP class DoubleDQN: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): - self.n_actions = n_actions # 总的动作个数 + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # e-greedy策略相关参数 @@ -34,8 +33,8 @@ def __init__(self, n_states, n_actions, cfg): self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay self.batch_size = cfg.batch_size - self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout @@ -63,7 +62,7 @@ def choose_action(self, state): # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): diff --git a/codes/DoubleDQN/assets/20201222145725907.png b/codes/DoubleDQN/assets/20201222145725907.png new file mode 100644 index 00000000..d2cbb2d3 Binary files /dev/null and b/codes/DoubleDQN/assets/20201222145725907.png differ diff --git a/codes/DoubleDQN/assets/20201222150225327.png b/codes/DoubleDQN/assets/20201222150225327.png new file mode 100644 index 00000000..20b79be3 Binary files /dev/null and b/codes/DoubleDQN/assets/20201222150225327.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png new file mode 100644 index 00000000..427a9034 Binary files 
/dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png new file mode 100644 index 00000000..d95f900b Binary files /dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png new file mode 100644 index 00000000..ddeda962 Binary files /dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png new file mode 100644 index 00000000..dec19e50 Binary files /dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png differ diff --git a/codes/DoubleDQN/main.py b/codes/DoubleDQN/main.py index 88add9af..57c9f9c4 100644 --- a/codes/DoubleDQN/main.py +++ b/codes/DoubleDQN/main.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-03-17 20:11:19 +LastEditTime: 2021-03-28 11:05:14 @Discription: @Environment: python 3.7.7 ''' @@ -32,7 +32,7 @@ class DoubleDQNConfig: def __init__(self): - self.algo = "Double DQN" # 算法名称 + self.algo = "Double DQN" # name of algo self.gamma = 0.99 self.epsilon_start = 0.9 # e-greedy策略的初始epsilon self.epsilon_end = 0.01 @@ -40,7 +40,7 @@ def __init__(self): self.lr = 0.01 # 学习率 self.memory_capacity = 10000 # Replay Memory容量 self.batch_size = 128 - self.train_eps = 250 # 训练的episode数目 + self.train_eps = 300 # 训练的episode数目 self.train_steps = 200 # 训练每个episode的最大长度 self.target_update = 2 # target net的更新频率 self.eval_eps = 20 # 测试的episode数目 @@ -84,9 +84,9 @@ def train(cfg,env,agent): cfg = DoubleDQNConfig() env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym,此处一般不需要 env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = DoubleDQN(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = DoubleDQN(state_dim,action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy b/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy new file mode 100644 index 00000000..1c4be2b5 Binary files /dev/null and 
b/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy differ diff --git a/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png b/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png new file mode 100644 index 00000000..2817223d Binary files /dev/null and b/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png differ diff --git a/codes/DoubleDQN/results/20210328-110516/rewards_train.npy b/codes/DoubleDQN/results/20210328-110516/rewards_train.npy new file mode 100644 index 00000000..73acfde5 Binary files /dev/null and b/codes/DoubleDQN/results/20210328-110516/rewards_train.npy differ diff --git a/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth b/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth new file mode 100644 index 00000000..69f5fce4 Binary files /dev/null and b/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth differ diff --git a/codes/HierarchicalDQN/agent.py b/codes/HierarchicalDQN/agent.py new file mode 100644 index 00000000..84e79e07 --- /dev/null +++ b/codes/HierarchicalDQN/agent.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:18:18 +LastEditor: John +LastEditTime: 2021-03-27 04:24:30 +Discription: +Environment: +''' +import torch +import torch.nn as nn +import numpy as np +import random,math +from HierarchicalDQN.model import MLP +from common.memory import ReplayBuffer +import torch.optim as optim +class HierarchicalDQN: + def __init__(self,state_dim,action_dim,cfg): + self.action_dim = action_dim + self.device = cfg.device + self.batch_size = cfg.batch_size + self.sample_count = 0 + self.epsilon = 0 + self.epsilon_start = cfg.epsilon_start + self.epsilon_end = cfg.epsilon_end + self.epsilon_decay = cfg.epsilon_decay + self.batch_size = cfg.batch_size + self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) + self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) + self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) + self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) + self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr) + self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr) + self.memory = ReplayBuffer(cfg.memory_capacity) + self.meta_memory = ReplayBuffer(cfg.memory_capacity) + def to_onehot(x): + oh = np.zeros(6) + oh[x - 1] = 1. + return oh + def set_goal(self,meta_state): + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay) + self.sample_count += 1 + if random.random() > self.epsilon: + with torch.no_grad(): + meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32) + q_value = self.policy_net(meta_state) + goal = q_value.max(1)[1].item() + else: + goal = random.randrange(self.action_dim) + goal = self.meta_policy_net(meta_state) + onehot_goal = self.to_onehot(goal) + return onehot_goal + def choose_action(self,state): + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. 
* self.sample_count / self.epsilon_decay) + self.sample_count += 1 + if random.random() > self.epsilon: + with torch.no_grad(): + state = torch.tensor([state], device=self.device, dtype=torch.float32) + q_value = self.policy_net(state) + action = q_value.max(1)[1].item() + else: + action = random.randrange(self.action_dim) + return action + def update(self): + if self.batch_size > len(self.memory): + state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size) + state_batch = torch.tensor( + state_batch, device=self.device, dtype=torch.float) + action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) + reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float) + next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) + done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) + q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) + next_state_values = self.target_net(next_state_batch).max(1)[0].detach() + expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0]) + loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) + self.optimizer.zero_grad() + loss.backward() + for param in self.policy_net.parameters(): + param.grad.data.clamp_(-1, 1) + self.optimizer.step() + + if self.batch_size > len(self.meta_memory): + meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.memory.sample(self.batch_size) + meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float) + meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1) + meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float) + next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float) + meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1) + meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch) + next_state_values = self.target_net(next_meta_state_batch).max(1)[0].detach() + expected_meta_q_values = meta_reward_batch + self.gamma * next_state_values * (1-meta_done_batch[0]) + meta_loss = nn.MSEmeta_loss()(meta_q_values, expected_meta_q_values.unsqueeze(1)) + self.meta_optimizer.zero_grad() + meta_loss.backward() + for param in self.meta_policy_net.parameters(): + param.grad.data.clamp_(-1, 1) + self.meta_optimizer.step() + + \ No newline at end of file diff --git a/codes/HierarchicalDQN/main.py b/codes/HierarchicalDQN/main.py new file mode 100644 index 00000000..5ecd02f6 --- /dev/null +++ b/codes/HierarchicalDQN/main.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:14:04 +LastEditor: John +LastEditTime: 2021-03-27 04:23:43 +Discription: +Environment: +''' +import sys,os +sys.path.append(os.getcwd()) # add current terminal path to sys.path +import gym +import numpy as np +import torch +import datetime +from HierarchicalDQN.agent import HierarchicalDQN +from common.plot import plot_rewards +from common.utils import save_results + +SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time +SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): + 
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") +if not os.path.exists(SAVED_MODEL_PATH): + os.mkdir(SAVED_MODEL_PATH) +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") +if not os.path.exists(RESULT_PATH): + os.mkdir(RESULT_PATH) + +class HierarchicalDQNConfig: + def __init__(self): + self.algo = "DQN" # name of algo + self.gamma = 0.99 + self.epsilon_start = 0.95 # start epsilon of e-greedy policy + self.epsilon_end = 0.01 + self.epsilon_decay = 200 + self.lr = 0.01 # learning rate + self.memory_capacity = 800 # Replay Memory capacity + self.batch_size = 64 + self.train_eps = 250 # 训练的episode数目 + self.train_steps = 200 # 训练每个episode的最大长度 + self.target_update = 2 # target net的更新频率 + self.eval_eps = 20 # 测试的episode数目 + self.eval_steps = 200 # 测试每个episode的最大长度 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu + self.hidden_dim = 256 # dimension of hidden layer + +def train(cfg,env,agent): + print('Start to train !') + rewards = [] + ma_rewards = [] # moving average reward + ep_steps = [] + for i_episode in range(cfg.train_eps): + state = env.reset() + extrinsic_reward = 0 + for i_step in range(cfg.train_steps): + goal= agent.set_goal(state) + meta_state = state + goal_state = np.concatenate([state, goal]) + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + extrinsic_reward += reward + intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0 + agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done) + state = next_state + agent.update() + if done: + break + if i_episode % cfg.target_update == 0: + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done)) + ep_steps.append(i_step) + rewards.append(extrinsic_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*extrinsic_reward) + else: + ma_rewards.append(extrinsic_reward) + agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done) + print('Complete training!') + return rewards,ma_rewards + +if __name__ == "__main__": + cfg = HierarchicalDQNConfig() + env = gym.make('CartPole-v0') + env.seed(1) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = HierarchicalDQN(state_dim,action_dim,cfg) + rewards,ma_rewards = train(cfg,env,agent) + agent.save(path=SAVED_MODEL_PATH) + save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) + plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) \ No newline at end of file diff --git a/codes/HierarchicalDQN/model.py b/codes/HierarchicalDQN/model.py new file mode 100644 index 00000000..0bf05842 --- /dev/null +++ b/codes/HierarchicalDQN/model.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:14:12 +LastEditor: John +LastEditTime: 2021-03-24 22:17:09 +Discription: +Environment: +''' +import torch.nn as nn +import torch.nn.functional as F +class MLP(nn.Module): + def __init__(self, state_dim,action_dim,hidden_dim=128): + super(MLP, self).__init__() + self.fc1 = nn.Linear(state_dim, hidden_dim) + self.fc2 = nn.Linear(hidden_dim,hidden_dim) + self.fc3 = nn.Linear(hidden_dim, 
action_dim) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) \ No newline at end of file diff --git a/codes/MonteCarlo/agent.py b/codes/MonteCarlo/agent.py index 1484049d..3ec4d7a9 100644 --- a/codes/MonteCarlo/agent.py +++ b/codes/MonteCarlo/agent.py @@ -16,11 +16,11 @@ class FisrtVisitMC: ''' On-Policy First-Visit MC Control ''' - def __init__(self,n_actions,cfg): - self.n_actions = n_actions + def __init__(self,action_dim,cfg): + self.action_dim = action_dim self.epsilon = cfg.epsilon self.gamma = cfg.gamma - self.Q = defaultdict(lambda: np.zeros(n_actions)) + self.Q = defaultdict(lambda: np.zeros(action_dim)) self.returns_sum = defaultdict(float) # sum of returns self.returns_count = defaultdict(float) @@ -28,11 +28,11 @@ def choose_action(self,state): ''' e-greed policy ''' if state in self.Q.keys(): best_action = np.argmax(self.Q[state]) - action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions + action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs[best_action] += (1.0 - self.epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) else: - action = np.random.randint(0,self.n_actions) + action = np.random.randint(0,self.action_dim) return action def update(self,one_ep_transition): # Find all (state, action) pairs we've visited in this one_ep_transition diff --git a/codes/MonteCarlo/main.py b/codes/MonteCarlo/main.py index bdd5ca40..c9844751 100644 --- a/codes/MonteCarlo/main.py +++ b/codes/MonteCarlo/main.py @@ -79,8 +79,8 @@ def mc_train(cfg,env,agent): if __name__ == "__main__": mc_cfg = MCConfig() env = RacetrackEnv() - n_actions=9 - agent = FisrtVisitMC(n_actions,mc_cfg) + action_dim=9 + agent = FisrtVisitMC(action_dim,mc_cfg) rewards,ma_rewards= mc_train(mc_cfg,env,agent) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path=RESULT_PATH) diff --git a/codes/PolicyGradient/agent.py b/codes/PolicyGradient/agent.py index fdc805ca..997f4aed 100644 --- a/codes/PolicyGradient/agent.py +++ b/codes/PolicyGradient/agent.py @@ -17,9 +17,9 @@ class PolicyGradient: - def __init__(self, n_states,cfg): + def __init__(self, state_dim,cfg): self.gamma = cfg.gamma - self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim) + self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim) self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr) self.batch_size = cfg.batch_size diff --git a/codes/PolicyGradient/main.py b/codes/PolicyGradient/main.py index a4c2a08b..a35be7a9 100644 --- a/codes/PolicyGradient/main.py +++ b/codes/PolicyGradient/main.py @@ -80,9 +80,9 @@ def train(cfg,env,agent): cfg = PGConfig() env = gym.make('CartPole-v0') # 可google为什么unwrapped gym,此处一般不需要 env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = PolicyGradient(n_states,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = PolicyGradient(state_dim,cfg) rewards, ma_rewards = train(cfg,env,agent) agent.save_model(SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/PolicyGradient/model.py b/codes/PolicyGradient/model.py index 799affad..7f5b1a85 100644 --- a/codes/PolicyGradient/model.py +++ b/codes/PolicyGradient/model.py @@ -16,10 +16,10 @@ class MLP(nn.Module): 输入:state维度 输出:概率 ''' - def __init__(self,n_states,hidden_dim = 
36): + def __init__(self,state_dim,hidden_dim = 36): super(MLP, self).__init__() - # 24和36为hidden layer的层数,可根据state_dim, n_actions的情况来改变 - self.fc1 = nn.Linear(n_states, hidden_dim) + # 24和36为hidden layer的层数,可根据state_dim, action_dim的情况来改变 + self.fc1 = nn.Linear(state_dim, hidden_dim) self.fc2 = nn.Linear(hidden_dim,hidden_dim) self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left diff --git a/codes/QLearning/agent.py b/codes/QLearning/agent.py index f4a793a4..2d2cb974 100644 --- a/codes/QLearning/agent.py +++ b/codes/QLearning/agent.py @@ -5,7 +5,7 @@ Email: johnjim0816@gmail.com Date: 2020-09-11 23:03:00 LastEditor: John -LastEditTime: 2021-03-12 16:48:25 +LastEditTime: 2021-03-26 16:51:01 Discription: Environment: ''' @@ -16,39 +16,35 @@ class QLearning(object): def __init__(self, - n_actions,cfg): - self.n_actions = n_actions # number of actions + action_dim,cfg): + self.action_dim = action_dim # dimension of acgtion self.lr = cfg.lr # learning rate self.gamma = cfg.gamma self.epsilon = 0 - self.sample_count = 0 # epsilon随训练的也就是采样次数逐渐衰减,所以需要计数 + self.sample_count = 0 self.epsilon_start = cfg.epsilon_start self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay - self.Q_table = defaultdict(lambda: np.zeros(n_actions)) # 使用字典存储Q表,个人比较喜欢这种,也可以用下面一行的二维数组表示,但是需要额外更改代码 - # self.Q_table = np.zeros((n_states, n_actions)) # Q表 + self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # A nested dictionary that maps state -> (action -> action-value) def choose_action(self, state): self.sample_count += 1 self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ math.exp(-1. * self.sample_count / self.epsilon_decay) - # 随机选取0-1之间的值,如果大于epsilon就按照贪心策略选取action,否则随机选取 + # e-greedy policy if np.random.uniform(0, 1) > self.epsilon: - action = np.argmax(self.Q_table[state]) + action = np.argmax(self.Q_table[str(state)]) else: - action = np.random.choice(self.n_actions) # 有一定概率随机探索选取一个动作 + action = np.random.choice(self.action_dim) return action def update(self, state, action, reward, next_state, done): - Q_predict = self.Q_table[state][action] + Q_predict = self.Q_table[str(state)][action] if done: Q_target = reward # terminal state else: - Q_target = reward + self.gamma * np.max( - self.Q_table[next_state]) # Q_table-learning - self.Q_table[state][action] += self.lr * (Q_target - Q_predict) + Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)]) + self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict) def save(self,path): - '''把 Q表格 的数据保存到文件中 - ''' import dill torch.save( obj=self.Q_table, @@ -56,7 +52,5 @@ def save(self,path): pickle_module=dill ) def load(self, path): - '''从文件中读取数据到 Q表格 - ''' import dill self.Q_table =torch.load(f=path+'Qleaning_model.pkl',pickle_module=dill) \ No newline at end of file diff --git a/codes/QLearning/main.py b/codes/QLearning/main.py index 27a0934f..bf03ce9c 100644 --- a/codes/QLearning/main.py +++ b/codes/QLearning/main.py @@ -5,7 +5,7 @@ Email: johnjim0816@gmail.com Date: 2020-09-11 23:03:00 LastEditor: John -LastEditTime: 2021-03-12 21:16:50 +LastEditTime: 2021-03-26 17:16:07 Discription: Environment: ''' @@ -35,20 +35,18 @@ class QlearningConfig: '''训练相关参数''' def __init__(self): - self.n_episodes = 200 # 训练的episode数目 + self.train_eps = 200 # 训练的episode数目 self.gamma = 0.9 # reward的衰减率 self.epsilon_start = 0.99 # e-greedy策略中初始epsilon self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon self.epsilon_decay = 200 # e-greedy策略中epsilon的衰减率 - self.lr = 0.1 # 学习率 + self.lr = 0.1 # learning rate def 
train(cfg,env,agent): - # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up - # env = FrozenLakeWapper(env) - rewards = [] # 记录所有episode的reward - ma_rewards = [] # 滑动平均的reward + rewards = [] + ma_rewards = [] # moving average reward steps = [] # 记录所有episode的steps - for i_episode in range(cfg.n_episodes): + for i_episode in range(cfg.train_eps): ep_reward = 0 # 记录每个episode的reward ep_steps = 0 # 记录每个episode走了多少step state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) @@ -63,12 +61,11 @@ def train(cfg,env,agent): break steps.append(ep_steps) rewards.append(ep_reward) - # 计算滑动平均的reward if ma_rewards: ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) else: ma_rewards.append(ep_reward) - print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward)) + print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward)) return rewards,ma_rewards def eval(cfg,env,agent): @@ -77,7 +74,7 @@ def eval(cfg,env,agent): rewards = [] # 记录所有episode的reward ma_rewards = [] # 滑动平均的reward steps = [] # 记录所有episode的steps - for i_episode in range(cfg.n_episodes): + for i_episode in range(cfg.train_eps): ep_reward = 0 # 记录每个episode的reward ep_steps = 0 # 记录每个episode走了多少step state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) @@ -96,15 +93,15 @@ def eval(cfg,env,agent): ma_rewards.append(rewards[-1]*0.9+ep_reward*0.1) else: ma_rewards.append(ep_reward) - print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward)) + print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward)) return rewards,ma_rewards if __name__ == "__main__": cfg = QlearningConfig() env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left env = CliffWalkingWapper(env) - n_actions = env.action_space.n - agent = QLearning(n_actions,cfg) + action_dim = env.action_space.n + agent = QLearning(action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/QLearning/results/20210326-171621/ma_rewards_train.npy b/codes/QLearning/results/20210326-171621/ma_rewards_train.npy new file mode 100644 index 00000000..0f842f2b Binary files /dev/null and b/codes/QLearning/results/20210326-171621/ma_rewards_train.npy differ diff --git a/codes/QLearning/results/20210326-171621/rewards_curve_train.png b/codes/QLearning/results/20210326-171621/rewards_curve_train.png new file mode 100644 index 00000000..985b8c7b Binary files /dev/null and b/codes/QLearning/results/20210326-171621/rewards_curve_train.png differ diff --git a/codes/QLearning/results/20210326-171621/rewards_train.npy b/codes/QLearning/results/20210326-171621/rewards_train.npy new file mode 100644 index 00000000..ed8f5240 Binary files /dev/null and b/codes/QLearning/results/20210326-171621/rewards_train.npy differ diff --git a/codes/QLearning/saved_model/20210326-171621/Qleaning_model.pkl b/codes/QLearning/saved_model/20210326-171621/Qleaning_model.pkl new file mode 100644 index 00000000..47e72796 Binary files /dev/null and b/codes/QLearning/saved_model/20210326-171621/Qleaning_model.pkl differ diff --git a/codes/README_en.md b/codes/README_en.md new file mode 100644 index 00000000..c931b6a2 --- /dev/null +++ b/codes/README_en.md @@ -0,0 +1,57 @@ + + +[Eng](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README_en.md)|[中文](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README.md) + +## Introduction + +This repo is used to 
learn basic RL algorithms, we will make it **detailed comment** and **clear structure** as much as possible: + +The code structure mainly contains several scripts as following: + +* ```model.py``` basic network model of RL, like MLP, CNN +* ```memory.py``` Replay Buffer +* ```plot.py``` use seaborn to plot rewards curve,saved in folder ``` result```. +* ```env.py``` to custom or normalize environments +* ```agent.py``` core algorithms, include a python Class with functions(choose action, update) +* ```main.py``` main function + + + +Note that ```model.py```,```memory.py```,```plot.py``` shall be utilized in different algorithms,thus they are put into ```common``` folder。 + +## Runnig Environment + +python 3.7.9、pytorch 1.6.0、gym 0.18.0 +## Usage + +Environment infomations see [环境说明](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md) + +## Schedule + +| Name | Related materials | Used Envs | Notes | +| :----------------------------------------------------------: | :---------------------------------------------------------: | ------------------------------------------------------------ | :----------------------------------------------------------: | +| [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | | +| [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | | +| [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | | +| [DQN](./DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | | +| [DQN-cnn](./DQN_cnn) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | | +| [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | not well | +| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | | +| [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | | +| A2C | | [CartPole-v0](./envs/gym_info.md) | | +| A3C | | | | +| SAC | | | | +| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | | +| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | | +| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | | +| GAIL | | | | + + +## Refs + + +[RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2) + +[RL-Adventure](https://github.com/higgsfield/RL-Adventure) + +https://www.cnblogs.com/lucifer1997/p/13458563.html diff --git a/codes/Sarsa/agent.py b/codes/Sarsa/agent.py index 37533812..020f6da8 100644 --- a/codes/Sarsa/agent.py +++ b/codes/Sarsa/agent.py @@ -14,17 +14,17 @@ import torch class Sarsa(object): def __init__(self, - n_actions,sarsa_cfg,): - self.n_actions = n_actions # number of actions + action_dim,sarsa_cfg,): + self.action_dim = action_dim # number of actions self.lr = sarsa_cfg.lr # learning rate self.gamma = sarsa_cfg.gamma self.epsilon = sarsa_cfg.epsilon - self.Q = defaultdict(lambda: np.zeros(n_actions)) - # self.Q = np.zeros((n_states, n_actions)) # Q表 + self.Q = defaultdict(lambda: np.zeros(action_dim)) + # self.Q = np.zeros((state_dim, action_dim)) # Q表 def choose_action(self, state): best_action = np.argmax(self.Q[state]) # action = best_action - action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions + action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs[best_action] += (1.0 - self.epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 
return action diff --git a/codes/Sarsa/main.py b/codes/Sarsa/main.py index 0bc976f5..a2363edb 100644 --- a/codes/Sarsa/main.py +++ b/codes/Sarsa/main.py @@ -70,8 +70,8 @@ def sarsa_train(cfg,env,agent): if __name__ == "__main__": sarsa_cfg = SarsaConfig() env = RacetrackEnv() - n_actions=9 - agent = Sarsa(n_actions,sarsa_cfg) + action_dim=9 + agent = Sarsa(action_dim,sarsa_cfg) rewards,ma_rewards = sarsa_train(sarsa_cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/common/model.py b/codes/common/model.py index 008c39c1..e02e3c19 100644 --- a/codes/common/model.py +++ b/codes/common/model.py @@ -5,7 +5,7 @@ Email: johnjim0816@gmail.com Date: 2021-03-12 21:14:12 LastEditor: John -LastEditTime: 2021-03-23 16:35:46 +LastEditTime: 2021-03-24 22:15:00 Discription: Environment: ''' @@ -14,16 +14,16 @@ import torch.nn.functional as F from torch.distributions import Categorical -class MLP2(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): +class MLP(nn.Module): + def __init__(self, state_dim,action_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - n_states: 输入的feature即环境的state数目 - n_actions: 输出的action总个数 + state_dim: 输入的feature即环境的state数目 + action_dim: 输出的action总个数 """ - super(MLP2, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + super(MLP, self).__init__() + self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 + self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -32,10 +32,10 @@ def forward(self, x): return self.fc3(x) class Critic(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Critic, self).__init__() - self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) + self.linear1 = nn.Linear(n_obs + action_dim, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear3 = nn.Linear(hidden_size, 1) # 随机初始化为较小的值 @@ -51,11 +51,11 @@ def forward(self, state, action): return x class Actor(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Actor, self).__init__() self.linear1 = nn.Linear(n_obs, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, n_actions) + self.linear3 = nn.Linear(hidden_size, action_dim) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -67,18 +67,18 @@ def forward(self, x): return x class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim=256): + def __init__(self, state_dim, action_dim, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 1) ) self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=1), ) diff --git a/codes/envs/blackjack.py b/codes/envs/blackjack.py index 87f02d2d..69468952 100644 --- a/codes/envs/blackjack.py +++ b/codes/envs/blackjack.py @@ -77,7 +77,7 @@ def __init__(self, natural=False): self.natural = natural # Start the first game self._reset() # Number of - self.n_actions = 2 + self.action_dim = 2 def 
reset(self): return self._reset() diff --git a/codes/envs/cliff_walking.py b/codes/envs/cliff_walking.py index 05b9b2ee..73e33c73 100644 --- a/codes/envs/cliff_walking.py +++ b/codes/envs/cliff_walking.py @@ -31,7 +31,7 @@ def __init__(self): self.shape = (4, 12) nS = np.prod(self.shape) - n_actions = 4 + action_dim = 4 # Cliff Location self._cliff = np.zeros(self.shape, dtype=np.bool) @@ -41,7 +41,7 @@ def __init__(self): P = {} for s in range(nS): position = np.unravel_index(s, self.shape) - P[s] = { a : [] for a in range(n_actions) } + P[s] = { a : [] for a in range(action_dim) } P[s][UP] = self._calculate_transition_prob(position, [-1, 0]) P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1]) P[s][DOWN] = self._calculate_transition_prob(position, [1, 0]) @@ -51,7 +51,7 @@ def __init__(self): isd = np.zeros(nS) isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 - super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd) + super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd) def render(self, mode='human', close=False): self._render(mode, close) diff --git a/codes/envs/gridworld.py b/codes/envs/gridworld.py index cf3aec29..c4fd512d 100644 --- a/codes/envs/gridworld.py +++ b/codes/envs/gridworld.py @@ -37,7 +37,7 @@ def __init__(self, shape=[4,4]): self.shape = shape nS = np.prod(shape) - n_actions = 4 + action_dim = 4 MAX_Y = shape[0] MAX_X = shape[1] @@ -51,7 +51,7 @@ def __init__(self, shape=[4,4]): y, x = it.multi_index # P[s][a] = (prob, next_state, reward, is_done) - P[s] = {a : [] for a in range(n_actions)} + P[s] = {a : [] for a in range(action_dim)} is_done = lambda s: s == 0 or s == (nS - 1) reward = 0.0 if is_done(s) else -1.0 @@ -82,7 +82,7 @@ def __init__(self, shape=[4,4]): # This should not be used in any model-free learning algorithm self.P = P - super(GridworldEnv, self).__init__(nS, n_actions, P, isd) + super(GridworldEnv, self).__init__(nS, action_dim, P, isd) def _render(self, mode='human', close=False): """ Renders the current gridworld layout diff --git a/codes/envs/stochastic_mdp.py b/codes/envs/stochastic_mdp.py new file mode 100644 index 00000000..5770fa5d --- /dev/null +++ b/codes/envs/stochastic_mdp.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:12:19 +LastEditor: John +LastEditTime: 2021-03-26 17:12:43 +Discription: +Environment: +''' +import numpy as np +import random + + +class StochasticMDP: + def __init__(self): + self.end = False + self.curr_state = 2 + self.action_dim = 2 + self.state_dim = 6 + self.p_right = 0.5 + + def reset(self): + self.end = False + self.curr_state = 2 + state = np.zeros(self.state_dim) + state[self.curr_state - 1] = 1. + return state + + def step(self, action): + if self.curr_state != 1: + if action == 1: + if random.random() < self.p_right and self.curr_state < self.state_dim: + self.curr_state += 1 + else: + self.curr_state -= 1 + + if action == 0: + self.curr_state -= 1 + if self.curr_state == self.state_dim: + self.end = True + + state = np.zeros(self.state_dim) + state[self.curr_state - 1] = 1. 
+ + if self.curr_state == 1: + if self.end: + return state, 1.00, True, {} + else: + return state, 1.00/100.00, True, {} + else: + return state, 0.0, False, {} diff --git a/codes/envs/windy_gridworld.py b/codes/envs/windy_gridworld.py index 2a9d4a47..ac9c66ad 100644 --- a/codes/envs/windy_gridworld.py +++ b/codes/envs/windy_gridworld.py @@ -30,7 +30,7 @@ def __init__(self): self.shape = (7, 10) nS = np.prod(self.shape) - n_actions = 4 + action_dim = 4 # Wind strength winds = np.zeros(self.shape) @@ -41,7 +41,7 @@ def __init__(self): P = {} for s in range(nS): position = np.unravel_index(s, self.shape) - P[s] = { a : [] for a in range(n_actions) } + P[s] = { a : [] for a in range(action_dim) } P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds) P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds) P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds) @@ -51,7 +51,7 @@ def __init__(self): isd = np.zeros(nS) isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 - super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd) + super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd) def render(self, mode='human', close=False): self._render(mode, close) diff --git a/docs/README.md b/docs/README.md index b4a4ce9c..d62260a5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -30,23 +30,7 @@ | [第十三章 AlphaStar 论文解读](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | | ## 算法实战 -| 算法名称 | 相关论文材料 | 环境 | 备注 | -| :--------------------------------------: | :---------------------------------------------------------: | ------------------------------------- | :--------------------------------: | -| [On-Policy First-Visit MC](../codes/MonteCarlo) | | [Racetrack](../codes/envs/racetrack_env.md) | | -| [Q-Learning](../codes/QLearning) | | [CliffWalking-v0](../codes/envs/gym_info.md) | | -| [Sarsa](../codes/Sarsa) | | [Racetrack](../codes/envs/racetrack_env.md) | | -| [DQN](../codes/DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | | -| DQN-cnn | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | 与DQN相比使用了CNN而不是全链接网络 | -| [DoubleDQN](../codes/DoubleDQN) | | [CartPole-v0](../codes/envs/gym_info.md) | 效果不好,待改进 | -| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | | -| [PolicyGradient](../codes/PolicyGradient) | | [CartPole-v0](../codes/envs/gym_info.md) | | -| A2C | | [CartPole-v0](../codes/envs/gym_info.md) | | -| A3C | | | | -| SAC | | | | -| [PPO](../codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](../codes/envs/gym_info.md) | | -| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](../codes/envs/gym_info.md) | | -| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | | -| GAIL | | | | +[点击](../codes)或者跳转```codes```文件夹下进入算法实战 ## 贡献者
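
The Double-DQN README above computes the decoupled target inline inside `update()`. The snippet below is a minimal, self-contained sketch of the same idea (the online/policy network selects the greedy next action and the target network evaluates it); the function name `double_dqn_target` and its exact signature are illustrative and not part of the repo:

```python
import torch
import torch.nn as nn

def double_dqn_target(policy_net: nn.Module, target_net: nn.Module,
                      reward_batch: torch.Tensor, next_state_batch: torch.Tensor,
                      done_batch: torch.Tensor, gamma: float = 0.99) -> torch.Tensor:
    """Double DQN target: the online (policy) network picks the greedy next
    action, the target network evaluates it, which curbs over-estimation."""
    with torch.no_grad():
        # action selection with the online network: a* = argmax_a Q(s', a)
        next_actions = policy_net(next_state_batch).max(1)[1].unsqueeze(1)
        # action evaluation with the target network: Q'(s', a*)
        next_q = target_net(next_state_batch).gather(1, next_actions).squeeze(1)
    # terminal transitions (done = 1) contribute only the immediate reward
    return reward_batch + gamma * next_q * (1 - done_batch)
```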
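
The DDPG agent in `codes/DDPG/agent.py` hard-copies the critic and actor weights into `target_critic` and `target_actor` at construction time. DDPG conventionally keeps the targets trailing the online networks afterwards with a soft (Polyak) update inside `update()`; that part of the file is not in this diff, so the helper below is only a sketch, with `soft_update` and `tau` as assumed names:

```python
import torch.nn as nn

def soft_update(target_net: nn.Module, source_net: nn.Module, tau: float = 1e-2) -> None:
    """Polyak averaging: target <- tau * source + (1 - tau) * target.
    With tau = 1.0 this reduces to the hard copy done in DDPG.__init__."""
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
```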
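
Several agents above import `ReplayBuffer` from `common/memory.py`, calling `memory.push(state, action, reward, next_state, done)` to store a transition and `memory.sample(batch_size)` to draw five batches, but that file itself is not touched by this diff. A minimal buffer consistent with that interface might look like the following sketch (the actual implementation in `common/memory.py` may differ):

```python
import random

class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done)
    transitions, matching the push/sample calls made by the agents above."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)  # grow until capacity is reached
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity  # then overwrite the oldest

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # transpose the list of transitions into five batch tuples
        state, action, reward, next_state, done = zip(*batch)
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
```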
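
The DQN, DoubleDQN, HierarchicalDQN and QLearning agents above all anneal epsilon with the same exponential schedule, written in code as `self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)`. With $t$ the running count of sampled actions, this is

$$
\varepsilon_t = \varepsilon_{\text{end}} + \left(\varepsilon_{\text{start}} - \varepsilon_{\text{end}}\right) e^{-t/\lambda},
\qquad \lambda = \texttt{epsilon\_decay},
$$

so exploration starts near `epsilon_start` and decays towards `epsilon_end` at a rate set by `epsilon_decay`.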