Commit

update
johnjim0816 committed Mar 28, 2021
1 parent 2df8d96 commit 6e4d966
Showing 56 changed files with 497 additions and 165 deletions.
18 changes: 1 addition & 17 deletions README.md
@@ -30,23 +30,7 @@
| [Chapter 13: AlphaStar Paper Walkthrough](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | |
## Hands-On Algorithms

| Algorithm | Reference | Environment | Notes |
| :--------------------------------------: | :---------------------------------------------------------: | ------------------------------------- | :--------------------------------: |
| [On-Policy First-Visit MC](./codes/MonteCarlo) | | [Racetrack](./codes/envs/racetrack_env.md) | |
| [Q-Learning](./codes/QLearning) | | [CliffWalking-v0](./codes/envs/gym_info.md) | |
| [Sarsa](./codes/Sarsa) | | [Racetrack](./codes/envs/racetrack_env.md) | |
| [DQN](./codes/DQN) | [DQN paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | |
| DQN-cnn | [DQN paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | Uses a CNN instead of a fully connected network |
| [DoubleDQN](./codes/DoubleDQN) | | [CartPole-v0](./codes/envs/gym_info.md) | Results are poor; to be improved |
| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
| [PolicyGradient](./codes/PolicyGradient) | | [CartPole-v0](./codes/envs/gym_info.md) | |
| A2C | | [CartPole-v0](./codes/envs/gym_info.md) | |
| A3C | | | |
| SAC | | | |
| [PPO](./codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./codes/envs/gym_info.md) | |
| DDPG | [DDPG paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./codes/envs/gym_info.md) | |
| TD3 | [Twin Delayed DDPG paper](https://arxiv.org/abs/1802.09477) | | |
| GAIL | | | |
[Click here](./codes) or go to the ```codes``` folder for the hands-on algorithm code.

## Contributors

4 changes: 2 additions & 2 deletions codes/A2C/agent.py
@@ -13,9 +13,9 @@
import torch.optim as optim

class A2C:
def __init__(self,n_states, n_actions, cfg):
def __init__(self,state_dim, action_dim, cfg):
self.gamma = 0.99
self.model = ActorCritic(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(cfg.device)
self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device)
self.optimizer = optim.Adam(self.model.parameters(),lr=cfg.lr)
def choose_action(self, state):
dist, value = self.model(state)
6 changes: 3 additions & 3 deletions codes/A2C/main.py
@@ -95,8 +95,8 @@ def train(cfg,env,agent)
cfg = A2CConfig()
env = gym.make('CartPole-v0')
env.seed(1) # set random seed for env
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = A2C(n_states, n_actions, cfg)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = A2C(state_dim, action_dim, cfg)
train(cfg,env,agent)

8 changes: 4 additions & 4 deletions codes/A2C/model.py
@@ -13,18 +13,18 @@
from torch.distributions import Categorical

class ActorCritic(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim=256):
def __init__(self, state_dim, action_dim, hidden_dim=256):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)

self.actor = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, n_actions),
nn.Linear(hidden_dim, action_dim),
nn.Softmax(dim=1),
)

10 changes: 5 additions & 5 deletions codes/DDPG/agent.py
@@ -19,12 +19,12 @@


class DDPG:
def __init__(self, n_states, n_actions, cfg):
def __init__(self, state_dim, action_dim, cfg):
self.device = cfg.device
self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)

for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
target_param.data.copy_(param.data)
6 changes: 3 additions & 3 deletions codes/DDPG/env.py
@@ -41,17 +41,17 @@ def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.n_actions = action_space.shape[0]
self.action_dim = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()

def reset(self):
self.obs = np.ones(self.n_actions) * self.mu
self.obs = np.ones(self.action_dim) * self.mu

def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
self.obs = x + dx
return self.obs

6 changes: 3 additions & 3 deletions codes/DDPG/main.py
@@ -82,9 +82,9 @@ def train(cfg,env,agent)
cfg = DDPGConfig()
env = NormalizedActions(gym.make("Pendulum-v0"))
env.seed(1) # set random seed for env
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states,n_actions,cfg)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
agent = DDPG(state_dim,action_dim,cfg)
rewards,ma_rewards = train(cfg,env,agent)
agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
14 changes: 7 additions & 7 deletions codes/DQN/README.md
@@ -46,15 +46,15 @@ import torch.nn as nn
import torch.nn.functional as F

class FCN(nn.Module):
def __init__(self, n_states=4, n_actions=18):
def __init__(self, state_dim=4, action_dim=18):
""" 初始化q网络,为全连接网络
n_states: 输入的feature即环境的state数目
n_actions: 输出的action总个数
state_dim: 输入的feature即环境的state数目
action_dim: 输出的action总个数
"""
super(FCN, self).__init__()
self.fc1 = nn.Linear(n_states, 128) # input layer
self.fc1 = nn.Linear(state_dim, 128) # input layer
self.fc2 = nn.Linear(128, 128) # hidden layer
self.fc3 = nn.Linear(128, n_actions) # output layer
self.fc3 = nn.Linear(128, action_dim) # output layer

def forward(self, x):
# activation functions applied at each layer
@@ -66,8 +66,8 @@ class FCN(nn.Module):

In ```agent.py``` we define the reinforcement learning algorithm, including its two main functions, ```choose_action``` and ```update```. In the initialization:
```python
self.policy_net = FCN(n_states, n_actions).to(self.device)
self.target_net = FCN(n_states, n_actions).to(self.device)
self.policy_net = FCN(state_dim, action_dim).to(self.device)
self.target_net = FCN(state_dim, action_dim).to(self.device)
# target_net starts as an exact copy of policy_net's parameters
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # do not enable BatchNormalization or Dropout
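# --- A rough sketch (for illustration; not necessarily the repository's exact code) of the
# --- epsilon-greedy choose_action() that agent.py defines alongside update(). The decay
# --- schedule and the sample_count counter are assumptions:
def choose_action(self, state, train=True):
    if train:
        # epsilon decays from epsilon_start toward epsilon_end as more actions are sampled
        self.sample_count += 1
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.sample_count / self.epsilon_decay)
        if random.random() > epsilon:
            with torch.no_grad():
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()  # greedy: index of the largest Q value
        else:
            action = random.randrange(self.action_dim)  # explore: uniform random action
    else:
        # evaluation: always act greedily with respect to the learned Q values
        with torch.no_grad():
            state = torch.tensor([state], device=self.device, dtype=torch.float32)
            action = self.policy_net(state).max(1)[1].item()
    return action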
12 changes: 6 additions & 6 deletions codes/DQN/agent.py
@@ -20,11 +20,11 @@
import math
import numpy as np
from common.memory import ReplayBuffer
from common.model import MLP2
from common.model import MLP
class DQN:
def __init__(self, n_states, n_actions, cfg):
def __init__(self, state_dim, action_dim, cfg):

self.n_actions = n_actions # total number of actions
self.action_dim = action_dim # total number of actions
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma # discount factor for rewards
# parameters of the e-greedy policy
@@ -34,8 +34,8 @@ def __init__(self, n_states, n_actions, cfg):
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net starts as an exact copy of policy_net's parameters
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # do not enable BatchNormalization or Dropout
@@ -64,7 +64,7 @@ def choose_action(self, state, train=True):
# so tensor.max(1)[1] returns the index of the maximum value, i.e., the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
action = random.randrange(self.action_dim)
return action
else:
with torch.no_grad(): # do not track gradients
10 changes: 5 additions & 5 deletions codes/DQN/main.py
@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-03-17 20:35:37
LastEditTime: 2021-03-26 17:17:17
@Discription:
@Environment: python 3.7.7
'''
@@ -40,7 +40,7 @@ def __init__(self):
self.lr = 0.01 # learning rate
self.memory_capacity = 800 # replay memory capacity
self.batch_size = 64
self.train_eps = 250 # number of training episodes
self.train_eps = 300 # number of training episodes
self.train_steps = 200 # max steps per training episode
self.target_update = 2 # update frequency of the target net
self.eval_eps = 20 # number of evaluation episodes
@@ -84,9 +84,9 @@ def train(cfg,env,agent):
cfg = DQNConfig()
env = gym.make('CartPole-v0').unwrapped # you can google why gym envs get unwrapped; usually not needed here
env.seed(1) # set random seed for env
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DQN(n_states,n_actions,cfg)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN(state_dim,action_dim,cfg)
rewards,ma_rewards = train(cfg,env,agent)
agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
Binary files not shown.
2 changes: 2 additions & 0 deletions codes/DQN_cnn/README.md
@@ -0,0 +1,2 @@
# DQN with CNN
Same principle as [DQN](../DQN); the only change is replacing the neural network with a convolutional neural network, for 2D observations (state or observation).
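A minimal sketch of such a convolutional Q-network (the class name `CNNQNet`, layer sizes, and 84×84 input shape are illustrative assumptions, not necessarily the code in this folder):

```python
import torch
import torch.nn as nn

class CNNQNet(nn.Module):
    """Convolutional Q-network for 2D observations, e.g. stacked image frames."""
    def __init__(self, in_channels=4, action_dim=18):
        super(CNNQNet, self).__init__()
        # convolutional feature extractor over the 2D observation
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        # fully connected head mapping features to one Q value per action
        self.fc = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),
            nn.Linear(512, action_dim),
        )

    def forward(self, x):
        # x: (batch, in_channels, 84, 84) for the feature sizes assumed above
        x = self.conv(x)
        x = x.view(x.size(0), -1)  # flatten before the linear layers
        return self.fc(x)
```

Everything else in the DQN agent (replay buffer, target network, epsilon-greedy exploration) stays the same; only the network that maps observations to Q values changes.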
39 changes: 39 additions & 0 deletions codes/DoubleDQN/README.md
@@ -0,0 +1,39 @@
Before reading this, you should have a basic understanding of the DQN algorithm; see [the hands-on DQN code](../DQN).

## Principle

Double DQN was proposed in 2016, inspired by Double Q-learning (2010); see the paper [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461).
Like Nature DQN, Double DQN uses two networks: a current network (denoted $Q$) and a target network (usually denoted $Q'$; to keep them apart we write $Q_{tar}$ below). Recall that for non-terminal states, the target $Q_{tar}$ value is computed as follows:
![在这里插入图片描述](assets/20201222145725907.png)

In Double DQN, instead of directly taking the maximum $Q_{tar}$ value over actions from the target network, we first select the action with the largest $Q$ value from the current network and then plug that action into the target network to compute the corresponding value:
![在这里插入图片描述](assets/20201222150225327.png)
The motivation: the max operator in Nature DQN drives the Q values quickly toward a plausible optimization target, but it easily overshoots and causes over-estimation, meaning the final model carries a large bias. To address this, DDQN decouples the selection of the target action from the evaluation of the target Q value; see the original paper if you are interested.
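
In symbols (using the $Q$ / $Q_{tar}$ notation above, for a non-terminal next state $s_{t+1}$), the two targets are:

$$y_t^{\text{DQN}} = r_{t+1} + \gamma \max_{a} Q_{tar}(s_{t+1}, a)$$

$$y_t^{\text{Double DQN}} = r_{t+1} + \gamma \, Q_{tar}\big(s_{t+1}, \arg\max_{a} Q(s_{t+1}, a)\big)$$

The only difference is which network picks the action: in Double DQN the current network $Q$ selects it, while the target network $Q_{tar}$ only evaluates it.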

The pseudocode is as follows:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png)
Alternatively, the two networks can each serve as both current and target network at the same time, as follows:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png)
Or, perhaps easier to understand, this is how they serve both roles at once:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png)

## Hands-On Code
The full program is on [github](https://github.com/JohnJim0816/reinforcement-learning-tutorials/tree/master/DoubleDQN). Given the principle above, the Double DQN modification is actually simple: it basically only requires changing a few lines in ```update```, as follows:
```python
'''The following is how Nature DQN computes q_target:
next_q_state_value = self.target_net(
next_state_batch).max(1)[0].detach() # max of Q'(s_{t+1}) over all next states, where Q' is the target network's Q function, e.g. tensor([ 0.0060, -0.0171,...,])
# compute q_target
# for terminal states done_batch[0]=1, so the expected_q_value equals the reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''The following is how Double DQN computes q_target; it differs slightly from Nature DQN'''
next_target_values = self.target_net(
next_state_batch)
# pick the action that maximizes Q(s_{t+1}, a) under the current net, then gather it from next_target_values to get the target net's next_q_value, i.e. Q'(s_{t+1}, argmax_a Q(s_{t+1}, a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0])
```
The resulting reward curves are shown below:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png)
The lower blue and red curves show the training rewards of Double DQN and Nature DQN respectively, while the upper blue and green curves show their rewards during evaluation.
15 changes: 7 additions & 8 deletions codes/DoubleDQN/agent.py
@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2021-03-13 15:01:27
LastEditTime: 2021-03-28 11:07:35
@Discription:
@Environment: python 3.7.7
'''
@@ -16,16 +16,15 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import math
import numpy as np
from common.memory import ReplayBuffer
from common.model import MLP2
from common.model import MLP
class DoubleDQN:
def __init__(self, n_states, n_actions, cfg):
def __init__(self, state_dim, action_dim, cfg):

self.n_actions = n_actions # total number of actions
self.action_dim = action_dim # total number of actions
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma
# parameters of the e-greedy policy
@@ -34,8 +33,8 @@ def __init__(self, n_states, n_actions, cfg):
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net starts as an exact copy of policy_net's parameters
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # do not enable BatchNormalization or Dropout
@@ -63,7 +62,7 @@ def choose_action(self, state):
# so tensor.max(1)[1] returns the index of the maximum value, i.e., the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
action = random.randrange(self.action_dim)
return action
def update(self):

Binary file added codes/DoubleDQN/assets/20201222145725907.png
Binary file added codes/DoubleDQN/assets/20201222150225327.png
Additional binary files not shown.
12 changes: 6 additions & 6 deletions codes/DoubleDQN/main.py
@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-03-17 20:11:19
LastEditTime: 2021-03-28 11:05:14
@Discription:
@Environment: python 3.7.7
'''
@@ -32,15 +32,15 @@

class DoubleDQNConfig:
def __init__(self):
self.algo = "Double DQN" # 算法名称
self.algo = "Double DQN" # name of algo
self.gamma = 0.99
self.epsilon_start = 0.9 # initial epsilon for the e-greedy policy
self.epsilon_end = 0.01
self.epsilon_decay = 200
self.lr = 0.01 # learning rate
self.memory_capacity = 10000 # replay memory capacity
self.batch_size = 128
self.train_eps = 250 # number of training episodes
self.train_eps = 300 # number of training episodes
self.train_steps = 200 # max steps per training episode
self.target_update = 2 # update frequency of the target net
self.eval_eps = 20 # number of evaluation episodes
@@ -84,9 +84,9 @@ def train(cfg,env,agent):
cfg = DoubleDQNConfig()
env = gym.make('CartPole-v0').unwrapped # you can google why gym envs get unwrapped; usually not needed here
env.seed(1) # set random seed for env
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DoubleDQN(n_states,n_actions,cfg)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DoubleDQN(state_dim,action_dim,cfg)
rewards,ma_rewards = train(cfg,env,agent)
agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
Binary files not shown.
