diff --git a/README.md b/README.md index b5c92897..900033a2 100644 --- a/README.md +++ b/README.md @@ -30,23 +30,7 @@ | [第十三章 AlphaStar 论文解读](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | | ## 算法实战 -| 算法名称 | 相关论文材料 | 环境 | 备注 | -| :--------------------------------------: | :---------------------------------------------------------: | ------------------------------------- | :--------------------------------: | -| [On-Policy First-Visit MC](./codes/MonteCarlo) | | [Racetrack](./codes/envs/racetrack_env.md) | | -| [Q-Learning](./codes/QLearning) | | [CliffWalking-v0](./codes/envs/gym_info.md) | | -| [Sarsa](./codes/Sarsa) | | [Racetrack](./codes/envs/racetrack_env.md) | | -| [DQN](./codes/DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | | -| DQN-cnn | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | 与DQN相比使用了CNN而不是全链接网络 | -| [DoubleDQN](./codes/DoubleDQN) | | [CartPole-v0](./codes/envs/gym_info.md) | 效果不好,待改进 | -| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | | -| [PolicyGradient](./codes/PolicyGradient) | | [CartPole-v0](./codes/envs/gym_info.md) | | -| A2C | | [CartPole-v0](./codes/envs/gym_info.md) | | -| A3C | | | | -| SAC | | | | -| [PPO](./codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./codes/envs/gym_info.md) | | -| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./codes/envs/gym_info.md) | | -| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | | -| GAIL | | | | +[点击](./codes)或者跳转```codes```文件夹下进入算法实战 ## 贡献者 diff --git a/codes/A2C/agent.py b/codes/A2C/agent.py index af1201bb..aafe7c1c 100644 --- a/codes/A2C/agent.py +++ b/codes/A2C/agent.py @@ -13,9 +13,9 @@ import torch.optim as optim class A2C: - def __init__(self,n_states, n_actions, cfg): + def __init__(self,state_dim, action_dim, cfg): self.gamma = 0.99 - self.model = ActorCritic(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(cfg.device) + self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device) self.optimizer = optim.Adam(self.model.parameters(),lr=cfg.lr) def choose_action(self, state): dist, value = self.model(state) diff --git a/codes/A2C/main.py b/codes/A2C/main.py index 08a1e1df..2bff5fc9 100644 --- a/codes/A2C/main.py +++ b/codes/A2C/main.py @@ -95,8 +95,8 @@ def train(cfg,env,agent): cfg = A2CConfig() env = gym.make('CartPole-v0') env.seed(1) # set random seed for env - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = A2C(n_states, n_actions, cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = A2C(state_dim, action_dim, cfg) train(cfg,env,agent) diff --git a/codes/A2C/model.py b/codes/A2C/model.py index 0ceba5e6..46b59dec 100644 --- a/codes/A2C/model.py +++ b/codes/A2C/model.py @@ -13,18 +13,18 @@ from torch.distributions import Categorical class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim=256): + def __init__(self, state_dim, action_dim, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 1) ) self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=1), ) diff --git 
a/codes/DDPG/agent.py b/codes/DDPG/agent.py index 29f34d66..f2860b70 100644 --- a/codes/DDPG/agent.py +++ b/codes/DDPG/agent.py @@ -19,12 +19,12 @@ class DDPG: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): self.device = cfg.device - self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_(param.data) diff --git a/codes/DDPG/env.py b/codes/DDPG/env.py index ad7bd0e1..85ca81c5 100644 --- a/codes/DDPG/env.py +++ b/codes/DDPG/env.py @@ -41,17 +41,17 @@ def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0. self.max_sigma = max_sigma self.min_sigma = min_sigma self.decay_period = decay_period - self.n_actions = action_space.shape[0] + self.action_dim = action_space.shape[0] self.low = action_space.low self.high = action_space.high self.reset() def reset(self): - self.obs = np.ones(self.n_actions) * self.mu + self.obs = np.ones(self.action_dim) * self.mu def evolve_obs(self): x = self.obs - dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions) + dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) self.obs = x + dx return self.obs diff --git a/codes/DDPG/main.py b/codes/DDPG/main.py index 5308ec60..bee9d214 100644 --- a/codes/DDPG/main.py +++ b/codes/DDPG/main.py @@ -82,9 +82,9 @@ def train(cfg,env,agent): cfg = DDPGConfig() env = NormalizedActions(gym.make("Pendulum-v0")) env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - agent = DDPG(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + agent = DDPG(state_dim,action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/DQN/README.md b/codes/DQN/README.md index ca3d6a9f..9eb22469 100644 --- a/codes/DQN/README.md +++ b/codes/DQN/README.md @@ -46,15 +46,15 @@ import torch.nn as nn import torch.nn.functional as F class FCN(nn.Module): - def __init__(self, n_states=4, n_actions=18): + def __init__(self, state_dim=4, action_dim=18): """ 初始化q网络,为全连接网络 - n_states: 输入的feature即环境的state数目 - n_actions: 输出的action总个数 + state_dim: 输入的feature即环境的state数目 + action_dim: 输出的action总个数 """ super(FCN, self).__init__() - self.fc1 = nn.Linear(n_states, 128) # 输入层 + self.fc1 = nn.Linear(state_dim, 128) # 输入层 self.fc2 = nn.Linear(128, 128) # 隐藏层 - self.fc3 = nn.Linear(128, n_actions) # 输出层 + self.fc3 = nn.Linear(128, action_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -66,8 +66,8 @@ class FCN(nn.Module): 在```agent.py```中我们定义强化学习算法,包括```choose_action```和```update```两个主要函数,初始化中: ```python -self.policy_net = FCN(n_states, n_actions).to(self.device) -self.target_net = FCN(n_states, n_actions).to(self.device) +self.policy_net = FCN(state_dim, 
action_dim).to(self.device) +self.target_net = FCN(state_dim, action_dim).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout diff --git a/codes/DQN/agent.py b/codes/DQN/agent.py index 299a4c9f..2b561751 100644 --- a/codes/DQN/agent.py +++ b/codes/DQN/agent.py @@ -20,11 +20,11 @@ import math import numpy as np from common.memory import ReplayBuffer -from common.model import MLP2 +from common.model import MLP class DQN: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): - self.n_actions = n_actions # 总的动作个数 + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # 奖励的折扣因子 # e-greedy策略相关参数 @@ -34,8 +34,8 @@ def __init__(self, n_states, n_actions, cfg): self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay self.batch_size = cfg.batch_size - self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout @@ -64,7 +64,7 @@ def choose_action(self, state, train=True): # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action else: with torch.no_grad(): # 取消保存梯度 diff --git a/codes/DQN/main.py b/codes/DQN/main.py index dae9c86e..a6a998e2 100644 --- a/codes/DQN/main.py +++ b/codes/DQN/main.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-03-17 20:35:37 +LastEditTime: 2021-03-26 17:17:17 @Discription: @Environment: python 3.7.7 ''' @@ -40,7 +40,7 @@ def __init__(self): self.lr = 0.01 # 学习率 self.memory_capacity = 800 # Replay Memory容量 self.batch_size = 64 - self.train_eps = 250 # 训练的episode数目 + self.train_eps = 300 # 训练的episode数目 self.train_steps = 200 # 训练每个episode的最大长度 self.target_update = 2 # target net的更新频率 self.eval_eps = 20 # 测试的episode数目 @@ -84,9 +84,9 @@ def train(cfg,env,agent): cfg = DQNConfig() env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym,此处一般不需要 env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = DQN(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = DQN(state_dim,action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/DQN/results/20210326-171704/ma_rewards_train.npy b/codes/DQN/results/20210326-171704/ma_rewards_train.npy new file mode 100644 index 00000000..2f231bb4 Binary files /dev/null and b/codes/DQN/results/20210326-171704/ma_rewards_train.npy differ diff --git a/codes/DQN/results/20210326-171704/rewards_curve_train.png b/codes/DQN/results/20210326-171704/rewards_curve_train.png new file mode 100644 index 00000000..0f289b23 Binary files /dev/null and b/codes/DQN/results/20210326-171704/rewards_curve_train.png differ diff --git a/codes/DQN/results/20210326-171704/rewards_train.npy 
b/codes/DQN/results/20210326-171704/rewards_train.npy new file mode 100644 index 00000000..9933915c Binary files /dev/null and b/codes/DQN/results/20210326-171704/rewards_train.npy differ diff --git a/codes/DQN/results/20210326-171722/ma_rewards_train.npy b/codes/DQN/results/20210326-171722/ma_rewards_train.npy new file mode 100644 index 00000000..1d9ea32f Binary files /dev/null and b/codes/DQN/results/20210326-171722/ma_rewards_train.npy differ diff --git a/codes/DQN/results/20210326-171722/rewards_curve_train.png b/codes/DQN/results/20210326-171722/rewards_curve_train.png new file mode 100644 index 00000000..e900e9c7 Binary files /dev/null and b/codes/DQN/results/20210326-171722/rewards_curve_train.png differ diff --git a/codes/DQN/results/20210326-171722/rewards_train.npy b/codes/DQN/results/20210326-171722/rewards_train.npy new file mode 100644 index 00000000..0351d733 Binary files /dev/null and b/codes/DQN/results/20210326-171722/rewards_train.npy differ diff --git a/codes/DQN/saved_model/20210326-171704/dqn_checkpoint.pth b/codes/DQN/saved_model/20210326-171704/dqn_checkpoint.pth new file mode 100644 index 00000000..567518ae Binary files /dev/null and b/codes/DQN/saved_model/20210326-171704/dqn_checkpoint.pth differ diff --git a/codes/DQN/saved_model/20210326-171722/dqn_checkpoint.pth b/codes/DQN/saved_model/20210326-171722/dqn_checkpoint.pth new file mode 100644 index 00000000..b460976a Binary files /dev/null and b/codes/DQN/saved_model/20210326-171722/dqn_checkpoint.pth differ diff --git a/codes/DQN_cnn/README.md b/codes/DQN_cnn/README.md new file mode 100644 index 00000000..4d1be2a8 --- /dev/null +++ b/codes/DQN_cnn/README.md @@ -0,0 +1,2 @@ +# DQN with cnn +原理与[DQN](../DQN)相同,只是将神经网络换成卷积神经网络,用于二维观测信息(state或obervation) \ No newline at end of file diff --git a/codes/DoubleDQN/README.md b/codes/DoubleDQN/README.md new file mode 100644 index 00000000..714bd26e --- /dev/null +++ b/codes/DoubleDQN/README.md @@ -0,0 +1,39 @@ +食用本篇之前,需要有DQN算法的基础,参考[DQN算法实战](../DQN)。 + +## 原理简介 + +Double-DQN是2016年提出的算法,灵感源自2010年的Double-Qlearning,可参考论文[Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)。 +跟Nature DQN一样,Double-DQN也用了两个网络,一个当前网络(对应用$Q$表示),一个目标网络(对应一般用$Q'$表示,为方便区分,以下用$Q_{tar}$代替)。我们先回忆一下,对于非终止状态,目标$Q_{tar}$值计算如下 +![在这里插入图片描述](assets/20201222145725907.png) + +而在Double-DQN中,不再是直接从目标$Q_{tar}$网络中选择各个动作中的最大$Q_{tar}$值,而是先从当前$Q$网络选择$Q$值最大对应的动作,然后代入到目标网络中计算对应的值: +![在这里插入图片描述](assets/20201222150225327.png) +Double-DQN的好处是Nature DQN中使用max虽然可以快速让Q值向可能的优化目标靠拢,但是很容易过犹不及,导致过度估计(Over Estimation),所谓过度估计就是最终我们得到的算法模型有很大的偏差(bias)。为了解决这个问题, DDQN通过解耦目标Q值动作的选择和目标Q值的计算这两步,来达到消除过度估计的问题,感兴趣可以阅读原论文。 + +伪代码如下: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png) +当然也可以两个网络可以同时为当前网络和目标网络,如下: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png) +或者这样更好理解如何同时为当前网络和目标网络: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png) + +## 代码实战 +完整程序见[github](https://github.com/JohnJim0816/reinforcement-learning-tutorials/tree/master/DoubleDQN)。结合上面的原理,其实Double DQN改进来很简单,基本只需要在```update```中修改几行代码,如下: +```python +'''以下是Nature DQN的q_target计算方式 +next_q_state_value = self.target_net( +next_state_batch).max(1)[0].detach() # # 计算所有next states的Q'(s_{t+1})的最大值,Q'为目标网络的q函数,比如tensor([ 0.0060, -0.0171,...,]) 
+#计算 q_target +#对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward +q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0]) +''' +'''以下是Double DQNq_target计算方式,与NatureDQN稍有不同''' +next_target_values = self.target_net( +next_state_batch) +#选出Q(s_t‘, a)对应的action,代入到next_target_values获得target net对应的next_q_value,即Q’(s_t|a=argmax Q(s_t‘, a)) +next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) +q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0]) +``` +reward变化结果如下: +![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png) +其中下边蓝色和红色分别表示Double DQN和Nature DQN在训练中的reward变化图,而上面蓝色和绿色则表示Double DQN和Nature DQN在测试中的reward变化图。 \ No newline at end of file diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/agent.py index 1f9c7c16..34774c4d 100644 --- a/codes/DoubleDQN/agent.py +++ b/codes/DoubleDQN/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2021-03-13 15:01:27 +LastEditTime: 2021-03-28 11:07:35 @Discription: @Environment: python 3.7.7 ''' @@ -16,16 +16,15 @@ import torch import torch.nn as nn import torch.optim as optim -import torch.nn.functional as F import random import math import numpy as np from common.memory import ReplayBuffer -from common.model import MLP2 +from common.model import MLP class DoubleDQN: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): - self.n_actions = n_actions # 总的动作个数 + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # e-greedy策略相关参数 @@ -34,8 +33,8 @@ def __init__(self, n_states, n_actions, cfg): self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay self.batch_size = cfg.batch_size - self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout @@ -63,7 +62,7 @@ def choose_action(self, state): # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): diff --git a/codes/DoubleDQN/assets/20201222145725907.png b/codes/DoubleDQN/assets/20201222145725907.png new file mode 100644 index 00000000..d2cbb2d3 Binary files /dev/null and b/codes/DoubleDQN/assets/20201222145725907.png differ diff --git a/codes/DoubleDQN/assets/20201222150225327.png b/codes/DoubleDQN/assets/20201222150225327.png new file mode 100644 index 00000000..20b79be3 Binary files /dev/null and b/codes/DoubleDQN/assets/20201222150225327.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png new file mode 100644 index 00000000..427a9034 Binary files 
/dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png new file mode 100644 index 00000000..d95f900b Binary files /dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png new file mode 100644 index 00000000..ddeda962 Binary files /dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png differ diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png new file mode 100644 index 00000000..dec19e50 Binary files /dev/null and b/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png differ diff --git a/codes/DoubleDQN/main.py b/codes/DoubleDQN/main.py index 88add9af..57c9f9c4 100644 --- a/codes/DoubleDQN/main.py +++ b/codes/DoubleDQN/main.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-03-17 20:11:19 +LastEditTime: 2021-03-28 11:05:14 @Discription: @Environment: python 3.7.7 ''' @@ -32,7 +32,7 @@ class DoubleDQNConfig: def __init__(self): - self.algo = "Double DQN" # 算法名称 + self.algo = "Double DQN" # name of algo self.gamma = 0.99 self.epsilon_start = 0.9 # e-greedy策略的初始epsilon self.epsilon_end = 0.01 @@ -40,7 +40,7 @@ def __init__(self): self.lr = 0.01 # 学习率 self.memory_capacity = 10000 # Replay Memory容量 self.batch_size = 128 - self.train_eps = 250 # 训练的episode数目 + self.train_eps = 300 # 训练的episode数目 self.train_steps = 200 # 训练每个episode的最大长度 self.target_update = 2 # target net的更新频率 self.eval_eps = 20 # 测试的episode数目 @@ -84,9 +84,9 @@ def train(cfg,env,agent): cfg = DoubleDQNConfig() env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym,此处一般不需要 env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = DoubleDQN(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = DoubleDQN(state_dim,action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy b/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy new file mode 100644 index 00000000..1c4be2b5 Binary files /dev/null and 
b/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy differ diff --git a/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png b/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png new file mode 100644 index 00000000..2817223d Binary files /dev/null and b/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png differ diff --git a/codes/DoubleDQN/results/20210328-110516/rewards_train.npy b/codes/DoubleDQN/results/20210328-110516/rewards_train.npy new file mode 100644 index 00000000..73acfde5 Binary files /dev/null and b/codes/DoubleDQN/results/20210328-110516/rewards_train.npy differ diff --git a/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth b/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth new file mode 100644 index 00000000..69f5fce4 Binary files /dev/null and b/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth differ diff --git a/codes/HierarchicalDQN/agent.py b/codes/HierarchicalDQN/agent.py new file mode 100644 index 00000000..84e79e07 --- /dev/null +++ b/codes/HierarchicalDQN/agent.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:18:18 +LastEditor: John +LastEditTime: 2021-03-27 04:24:30 +Discription: +Environment: +''' +import torch +import torch.nn as nn +import numpy as np +import random,math +from HierarchicalDQN.model import MLP +from common.memory import ReplayBuffer +import torch.optim as optim +class HierarchicalDQN: + def __init__(self,state_dim,action_dim,cfg): + self.action_dim = action_dim + self.device = cfg.device + self.batch_size = cfg.batch_size + self.sample_count = 0 + self.epsilon = 0 + self.epsilon_start = cfg.epsilon_start + self.epsilon_end = cfg.epsilon_end + self.epsilon_decay = cfg.epsilon_decay + self.batch_size = cfg.batch_size + self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) + self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) + self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) + self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) + self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr) + self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr) + self.memory = ReplayBuffer(cfg.memory_capacity) + self.meta_memory = ReplayBuffer(cfg.memory_capacity) + def to_onehot(x): + oh = np.zeros(6) + oh[x - 1] = 1. + return oh + def set_goal(self,meta_state): + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay) + self.sample_count += 1 + if random.random() > self.epsilon: + with torch.no_grad(): + meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32) + q_value = self.policy_net(meta_state) + goal = q_value.max(1)[1].item() + else: + goal = random.randrange(self.action_dim) + goal = self.meta_policy_net(meta_state) + onehot_goal = self.to_onehot(goal) + return onehot_goal + def choose_action(self,state): + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. 
* self.sample_count / self.epsilon_decay) + self.sample_count += 1 + if random.random() > self.epsilon: + with torch.no_grad(): + state = torch.tensor([state], device=self.device, dtype=torch.float32) + q_value = self.policy_net(state) + action = q_value.max(1)[1].item() + else: + action = random.randrange(self.action_dim) + return action + def update(self): + if self.batch_size > len(self.memory): + state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size) + state_batch = torch.tensor( + state_batch, device=self.device, dtype=torch.float) + action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) + reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float) + next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) + done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) + q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) + next_state_values = self.target_net(next_state_batch).max(1)[0].detach() + expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0]) + loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) + self.optimizer.zero_grad() + loss.backward() + for param in self.policy_net.parameters(): + param.grad.data.clamp_(-1, 1) + self.optimizer.step() + + if self.batch_size > len(self.meta_memory): + meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.memory.sample(self.batch_size) + meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float) + meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1) + meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float) + next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float) + meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1) + meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch) + next_state_values = self.target_net(next_meta_state_batch).max(1)[0].detach() + expected_meta_q_values = meta_reward_batch + self.gamma * next_state_values * (1-meta_done_batch[0]) + meta_loss = nn.MSEmeta_loss()(meta_q_values, expected_meta_q_values.unsqueeze(1)) + self.meta_optimizer.zero_grad() + meta_loss.backward() + for param in self.meta_policy_net.parameters(): + param.grad.data.clamp_(-1, 1) + self.meta_optimizer.step() + + \ No newline at end of file diff --git a/codes/HierarchicalDQN/main.py b/codes/HierarchicalDQN/main.py new file mode 100644 index 00000000..5ecd02f6 --- /dev/null +++ b/codes/HierarchicalDQN/main.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:14:04 +LastEditor: John +LastEditTime: 2021-03-27 04:23:43 +Discription: +Environment: +''' +import sys,os +sys.path.append(os.getcwd()) # add current terminal path to sys.path +import gym +import numpy as np +import torch +import datetime +from HierarchicalDQN.agent import HierarchicalDQN +from common.plot import plot_rewards +from common.utils import save_results + +SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time +SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): + 
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") +if not os.path.exists(SAVED_MODEL_PATH): + os.mkdir(SAVED_MODEL_PATH) +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") +if not os.path.exists(RESULT_PATH): + os.mkdir(RESULT_PATH) + +class HierarchicalDQNConfig: + def __init__(self): + self.algo = "DQN" # name of algo + self.gamma = 0.99 + self.epsilon_start = 0.95 # start epsilon of e-greedy policy + self.epsilon_end = 0.01 + self.epsilon_decay = 200 + self.lr = 0.01 # learning rate + self.memory_capacity = 800 # Replay Memory capacity + self.batch_size = 64 + self.train_eps = 250 # 训练的episode数目 + self.train_steps = 200 # 训练每个episode的最大长度 + self.target_update = 2 # target net的更新频率 + self.eval_eps = 20 # 测试的episode数目 + self.eval_steps = 200 # 测试每个episode的最大长度 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu + self.hidden_dim = 256 # dimension of hidden layer + +def train(cfg,env,agent): + print('Start to train !') + rewards = [] + ma_rewards = [] # moving average reward + ep_steps = [] + for i_episode in range(cfg.train_eps): + state = env.reset() + extrinsic_reward = 0 + for i_step in range(cfg.train_steps): + goal= agent.set_goal(state) + meta_state = state + goal_state = np.concatenate([state, goal]) + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + extrinsic_reward += reward + intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0 + agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done) + state = next_state + agent.update() + if done: + break + if i_episode % cfg.target_update == 0: + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done)) + ep_steps.append(i_step) + rewards.append(extrinsic_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*extrinsic_reward) + else: + ma_rewards.append(extrinsic_reward) + agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done) + print('Complete training!') + return rewards,ma_rewards + +if __name__ == "__main__": + cfg = HierarchicalDQNConfig() + env = gym.make('CartPole-v0') + env.seed(1) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = HierarchicalDQN(state_dim,action_dim,cfg) + rewards,ma_rewards = train(cfg,env,agent) + agent.save(path=SAVED_MODEL_PATH) + save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) + plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) \ No newline at end of file diff --git a/codes/HierarchicalDQN/model.py b/codes/HierarchicalDQN/model.py new file mode 100644 index 00000000..0bf05842 --- /dev/null +++ b/codes/HierarchicalDQN/model.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:14:12 +LastEditor: John +LastEditTime: 2021-03-24 22:17:09 +Discription: +Environment: +''' +import torch.nn as nn +import torch.nn.functional as F +class MLP(nn.Module): + def __init__(self, state_dim,action_dim,hidden_dim=128): + super(MLP, self).__init__() + self.fc1 = nn.Linear(state_dim, hidden_dim) + self.fc2 = nn.Linear(hidden_dim,hidden_dim) + self.fc3 = nn.Linear(hidden_dim, 
action_dim) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) \ No newline at end of file diff --git a/codes/MonteCarlo/agent.py b/codes/MonteCarlo/agent.py index 1484049d..3ec4d7a9 100644 --- a/codes/MonteCarlo/agent.py +++ b/codes/MonteCarlo/agent.py @@ -16,11 +16,11 @@ class FisrtVisitMC: ''' On-Policy First-Visit MC Control ''' - def __init__(self,n_actions,cfg): - self.n_actions = n_actions + def __init__(self,action_dim,cfg): + self.action_dim = action_dim self.epsilon = cfg.epsilon self.gamma = cfg.gamma - self.Q = defaultdict(lambda: np.zeros(n_actions)) + self.Q = defaultdict(lambda: np.zeros(action_dim)) self.returns_sum = defaultdict(float) # sum of returns self.returns_count = defaultdict(float) @@ -28,11 +28,11 @@ def choose_action(self,state): ''' e-greed policy ''' if state in self.Q.keys(): best_action = np.argmax(self.Q[state]) - action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions + action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs[best_action] += (1.0 - self.epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) else: - action = np.random.randint(0,self.n_actions) + action = np.random.randint(0,self.action_dim) return action def update(self,one_ep_transition): # Find all (state, action) pairs we've visited in this one_ep_transition diff --git a/codes/MonteCarlo/main.py b/codes/MonteCarlo/main.py index bdd5ca40..c9844751 100644 --- a/codes/MonteCarlo/main.py +++ b/codes/MonteCarlo/main.py @@ -79,8 +79,8 @@ def mc_train(cfg,env,agent): if __name__ == "__main__": mc_cfg = MCConfig() env = RacetrackEnv() - n_actions=9 - agent = FisrtVisitMC(n_actions,mc_cfg) + action_dim=9 + agent = FisrtVisitMC(action_dim,mc_cfg) rewards,ma_rewards= mc_train(mc_cfg,env,agent) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path=RESULT_PATH) diff --git a/codes/PolicyGradient/agent.py b/codes/PolicyGradient/agent.py index fdc805ca..997f4aed 100644 --- a/codes/PolicyGradient/agent.py +++ b/codes/PolicyGradient/agent.py @@ -17,9 +17,9 @@ class PolicyGradient: - def __init__(self, n_states,cfg): + def __init__(self, state_dim,cfg): self.gamma = cfg.gamma - self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim) + self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim) self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr) self.batch_size = cfg.batch_size diff --git a/codes/PolicyGradient/main.py b/codes/PolicyGradient/main.py index a4c2a08b..a35be7a9 100644 --- a/codes/PolicyGradient/main.py +++ b/codes/PolicyGradient/main.py @@ -80,9 +80,9 @@ def train(cfg,env,agent): cfg = PGConfig() env = gym.make('CartPole-v0') # 可google为什么unwrapped gym,此处一般不需要 env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = PolicyGradient(n_states,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = PolicyGradient(state_dim,cfg) rewards, ma_rewards = train(cfg,env,agent) agent.save_model(SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/PolicyGradient/model.py b/codes/PolicyGradient/model.py index 799affad..7f5b1a85 100644 --- a/codes/PolicyGradient/model.py +++ b/codes/PolicyGradient/model.py @@ -16,10 +16,10 @@ class MLP(nn.Module): 输入:state维度 输出:概率 ''' - def __init__(self,n_states,hidden_dim = 
36): + def __init__(self,state_dim,hidden_dim = 36): super(MLP, self).__init__() - # 24和36为hidden layer的层数,可根据state_dim, n_actions的情况来改变 - self.fc1 = nn.Linear(n_states, hidden_dim) + # 24和36为hidden layer的层数,可根据state_dim, action_dim的情况来改变 + self.fc1 = nn.Linear(state_dim, hidden_dim) self.fc2 = nn.Linear(hidden_dim,hidden_dim) self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left diff --git a/codes/QLearning/agent.py b/codes/QLearning/agent.py index f4a793a4..2d2cb974 100644 --- a/codes/QLearning/agent.py +++ b/codes/QLearning/agent.py @@ -5,7 +5,7 @@ Email: johnjim0816@gmail.com Date: 2020-09-11 23:03:00 LastEditor: John -LastEditTime: 2021-03-12 16:48:25 +LastEditTime: 2021-03-26 16:51:01 Discription: Environment: ''' @@ -16,39 +16,35 @@ class QLearning(object): def __init__(self, - n_actions,cfg): - self.n_actions = n_actions # number of actions + action_dim,cfg): + self.action_dim = action_dim # dimension of acgtion self.lr = cfg.lr # learning rate self.gamma = cfg.gamma self.epsilon = 0 - self.sample_count = 0 # epsilon随训练的也就是采样次数逐渐衰减,所以需要计数 + self.sample_count = 0 self.epsilon_start = cfg.epsilon_start self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay - self.Q_table = defaultdict(lambda: np.zeros(n_actions)) # 使用字典存储Q表,个人比较喜欢这种,也可以用下面一行的二维数组表示,但是需要额外更改代码 - # self.Q_table = np.zeros((n_states, n_actions)) # Q表 + self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # A nested dictionary that maps state -> (action -> action-value) def choose_action(self, state): self.sample_count += 1 self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ math.exp(-1. * self.sample_count / self.epsilon_decay) - # 随机选取0-1之间的值,如果大于epsilon就按照贪心策略选取action,否则随机选取 + # e-greedy policy if np.random.uniform(0, 1) > self.epsilon: - action = np.argmax(self.Q_table[state]) + action = np.argmax(self.Q_table[str(state)]) else: - action = np.random.choice(self.n_actions) # 有一定概率随机探索选取一个动作 + action = np.random.choice(self.action_dim) return action def update(self, state, action, reward, next_state, done): - Q_predict = self.Q_table[state][action] + Q_predict = self.Q_table[str(state)][action] if done: Q_target = reward # terminal state else: - Q_target = reward + self.gamma * np.max( - self.Q_table[next_state]) # Q_table-learning - self.Q_table[state][action] += self.lr * (Q_target - Q_predict) + Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)]) + self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict) def save(self,path): - '''把 Q表格 的数据保存到文件中 - ''' import dill torch.save( obj=self.Q_table, @@ -56,7 +52,5 @@ def save(self,path): pickle_module=dill ) def load(self, path): - '''从文件中读取数据到 Q表格 - ''' import dill self.Q_table =torch.load(f=path+'Qleaning_model.pkl',pickle_module=dill) \ No newline at end of file diff --git a/codes/QLearning/main.py b/codes/QLearning/main.py index 27a0934f..bf03ce9c 100644 --- a/codes/QLearning/main.py +++ b/codes/QLearning/main.py @@ -5,7 +5,7 @@ Email: johnjim0816@gmail.com Date: 2020-09-11 23:03:00 LastEditor: John -LastEditTime: 2021-03-12 21:16:50 +LastEditTime: 2021-03-26 17:16:07 Discription: Environment: ''' @@ -35,20 +35,18 @@ class QlearningConfig: '''训练相关参数''' def __init__(self): - self.n_episodes = 200 # 训练的episode数目 + self.train_eps = 200 # 训练的episode数目 self.gamma = 0.9 # reward的衰减率 self.epsilon_start = 0.99 # e-greedy策略中初始epsilon self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon self.epsilon_decay = 200 # e-greedy策略中epsilon的衰减率 - self.lr = 0.1 # 学习率 + self.lr = 0.1 # learning rate def 
train(cfg,env,agent): - # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up - # env = FrozenLakeWapper(env) - rewards = [] # 记录所有episode的reward - ma_rewards = [] # 滑动平均的reward + rewards = [] + ma_rewards = [] # moving average reward steps = [] # 记录所有episode的steps - for i_episode in range(cfg.n_episodes): + for i_episode in range(cfg.train_eps): ep_reward = 0 # 记录每个episode的reward ep_steps = 0 # 记录每个episode走了多少step state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) @@ -63,12 +61,11 @@ def train(cfg,env,agent): break steps.append(ep_steps) rewards.append(ep_reward) - # 计算滑动平均的reward if ma_rewards: ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) else: ma_rewards.append(ep_reward) - print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward)) + print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward)) return rewards,ma_rewards def eval(cfg,env,agent): @@ -77,7 +74,7 @@ def eval(cfg,env,agent): rewards = [] # 记录所有episode的reward ma_rewards = [] # 滑动平均的reward steps = [] # 记录所有episode的steps - for i_episode in range(cfg.n_episodes): + for i_episode in range(cfg.train_eps): ep_reward = 0 # 记录每个episode的reward ep_steps = 0 # 记录每个episode走了多少step state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) @@ -96,15 +93,15 @@ def eval(cfg,env,agent): ma_rewards.append(rewards[-1]*0.9+ep_reward*0.1) else: ma_rewards.append(ep_reward) - print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward)) + print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward)) return rewards,ma_rewards if __name__ == "__main__": cfg = QlearningConfig() env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left env = CliffWalkingWapper(env) - n_actions = env.action_space.n - agent = QLearning(n_actions,cfg) + action_dim = env.action_space.n + agent = QLearning(action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/QLearning/results/20210326-171621/ma_rewards_train.npy b/codes/QLearning/results/20210326-171621/ma_rewards_train.npy new file mode 100644 index 00000000..0f842f2b Binary files /dev/null and b/codes/QLearning/results/20210326-171621/ma_rewards_train.npy differ diff --git a/codes/QLearning/results/20210326-171621/rewards_curve_train.png b/codes/QLearning/results/20210326-171621/rewards_curve_train.png new file mode 100644 index 00000000..985b8c7b Binary files /dev/null and b/codes/QLearning/results/20210326-171621/rewards_curve_train.png differ diff --git a/codes/QLearning/results/20210326-171621/rewards_train.npy b/codes/QLearning/results/20210326-171621/rewards_train.npy new file mode 100644 index 00000000..ed8f5240 Binary files /dev/null and b/codes/QLearning/results/20210326-171621/rewards_train.npy differ diff --git a/codes/QLearning/saved_model/20210326-171621/Qleaning_model.pkl b/codes/QLearning/saved_model/20210326-171621/Qleaning_model.pkl new file mode 100644 index 00000000..47e72796 Binary files /dev/null and b/codes/QLearning/saved_model/20210326-171621/Qleaning_model.pkl differ diff --git a/codes/README_en.md b/codes/README_en.md new file mode 100644 index 00000000..c931b6a2 --- /dev/null +++ b/codes/README_en.md @@ -0,0 +1,57 @@ + + +[Eng](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README_en.md)|[中文](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README.md) + +## Introduction + +This repo is used to 
learn basic RL algorithms, we will make it **detailed comment** and **clear structure** as much as possible: + +The code structure mainly contains several scripts as following: + +* ```model.py``` basic network model of RL, like MLP, CNN +* ```memory.py``` Replay Buffer +* ```plot.py``` use seaborn to plot rewards curve,saved in folder ``` result```. +* ```env.py``` to custom or normalize environments +* ```agent.py``` core algorithms, include a python Class with functions(choose action, update) +* ```main.py``` main function + + + +Note that ```model.py```,```memory.py```,```plot.py``` shall be utilized in different algorithms,thus they are put into ```common``` folder。 + +## Runnig Environment + +python 3.7.9、pytorch 1.6.0、gym 0.18.0 +## Usage + +Environment infomations see [环境说明](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md) + +## Schedule + +| Name | Related materials | Used Envs | Notes | +| :----------------------------------------------------------: | :---------------------------------------------------------: | ------------------------------------------------------------ | :----------------------------------------------------------: | +| [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | | +| [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | | +| [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | | +| [DQN](./DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | | +| [DQN-cnn](./DQN_cnn) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | | +| [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | not well | +| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | | +| [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | | +| A2C | | [CartPole-v0](./envs/gym_info.md) | | +| A3C | | | | +| SAC | | | | +| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | | +| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | | +| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | | +| GAIL | | | | + + +## Refs + + +[RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2) + +[RL-Adventure](https://github.com/higgsfield/RL-Adventure) + +https://www.cnblogs.com/lucifer1997/p/13458563.html diff --git a/codes/Sarsa/agent.py b/codes/Sarsa/agent.py index 37533812..020f6da8 100644 --- a/codes/Sarsa/agent.py +++ b/codes/Sarsa/agent.py @@ -14,17 +14,17 @@ import torch class Sarsa(object): def __init__(self, - n_actions,sarsa_cfg,): - self.n_actions = n_actions # number of actions + action_dim,sarsa_cfg,): + self.action_dim = action_dim # number of actions self.lr = sarsa_cfg.lr # learning rate self.gamma = sarsa_cfg.gamma self.epsilon = sarsa_cfg.epsilon - self.Q = defaultdict(lambda: np.zeros(n_actions)) - # self.Q = np.zeros((n_states, n_actions)) # Q表 + self.Q = defaultdict(lambda: np.zeros(action_dim)) + # self.Q = np.zeros((state_dim, action_dim)) # Q表 def choose_action(self, state): best_action = np.argmax(self.Q[state]) # action = best_action - action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions + action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs[best_action] += (1.0 - self.epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 
return action diff --git a/codes/Sarsa/main.py b/codes/Sarsa/main.py index 0bc976f5..a2363edb 100644 --- a/codes/Sarsa/main.py +++ b/codes/Sarsa/main.py @@ -70,8 +70,8 @@ def sarsa_train(cfg,env,agent): if __name__ == "__main__": sarsa_cfg = SarsaConfig() env = RacetrackEnv() - n_actions=9 - agent = Sarsa(n_actions,sarsa_cfg) + action_dim=9 + agent = Sarsa(action_dim,sarsa_cfg) rewards,ma_rewards = sarsa_train(sarsa_cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) diff --git a/codes/common/model.py b/codes/common/model.py index 008c39c1..e02e3c19 100644 --- a/codes/common/model.py +++ b/codes/common/model.py @@ -5,7 +5,7 @@ Email: johnjim0816@gmail.com Date: 2021-03-12 21:14:12 LastEditor: John -LastEditTime: 2021-03-23 16:35:46 +LastEditTime: 2021-03-24 22:15:00 Discription: Environment: ''' @@ -14,16 +14,16 @@ import torch.nn.functional as F from torch.distributions import Categorical -class MLP2(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): +class MLP(nn.Module): + def __init__(self, state_dim,action_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - n_states: 输入的feature即环境的state数目 - n_actions: 输出的action总个数 + state_dim: 输入的feature即环境的state数目 + action_dim: 输出的action总个数 """ - super(MLP2, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + super(MLP, self).__init__() + self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 + self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -32,10 +32,10 @@ def forward(self, x): return self.fc3(x) class Critic(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Critic, self).__init__() - self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) + self.linear1 = nn.Linear(n_obs + action_dim, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear3 = nn.Linear(hidden_size, 1) # 随机初始化为较小的值 @@ -51,11 +51,11 @@ def forward(self, state, action): return x class Actor(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Actor, self).__init__() self.linear1 = nn.Linear(n_obs, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, n_actions) + self.linear3 = nn.Linear(hidden_size, action_dim) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -67,18 +67,18 @@ def forward(self, x): return x class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim=256): + def __init__(self, state_dim, action_dim, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 1) ) self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=1), ) diff --git a/codes/envs/blackjack.py b/codes/envs/blackjack.py index 87f02d2d..69468952 100644 --- a/codes/envs/blackjack.py +++ b/codes/envs/blackjack.py @@ -77,7 +77,7 @@ def __init__(self, natural=False): self.natural = natural # Start the first game self._reset() # Number of - self.n_actions = 2 + self.action_dim = 2 def 
reset(self): return self._reset() diff --git a/codes/envs/cliff_walking.py b/codes/envs/cliff_walking.py index 05b9b2ee..73e33c73 100644 --- a/codes/envs/cliff_walking.py +++ b/codes/envs/cliff_walking.py @@ -31,7 +31,7 @@ def __init__(self): self.shape = (4, 12) nS = np.prod(self.shape) - n_actions = 4 + action_dim = 4 # Cliff Location self._cliff = np.zeros(self.shape, dtype=np.bool) @@ -41,7 +41,7 @@ def __init__(self): P = {} for s in range(nS): position = np.unravel_index(s, self.shape) - P[s] = { a : [] for a in range(n_actions) } + P[s] = { a : [] for a in range(action_dim) } P[s][UP] = self._calculate_transition_prob(position, [-1, 0]) P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1]) P[s][DOWN] = self._calculate_transition_prob(position, [1, 0]) @@ -51,7 +51,7 @@ def __init__(self): isd = np.zeros(nS) isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 - super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd) + super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd) def render(self, mode='human', close=False): self._render(mode, close) diff --git a/codes/envs/gridworld.py b/codes/envs/gridworld.py index cf3aec29..c4fd512d 100644 --- a/codes/envs/gridworld.py +++ b/codes/envs/gridworld.py @@ -37,7 +37,7 @@ def __init__(self, shape=[4,4]): self.shape = shape nS = np.prod(shape) - n_actions = 4 + action_dim = 4 MAX_Y = shape[0] MAX_X = shape[1] @@ -51,7 +51,7 @@ def __init__(self, shape=[4,4]): y, x = it.multi_index # P[s][a] = (prob, next_state, reward, is_done) - P[s] = {a : [] for a in range(n_actions)} + P[s] = {a : [] for a in range(action_dim)} is_done = lambda s: s == 0 or s == (nS - 1) reward = 0.0 if is_done(s) else -1.0 @@ -82,7 +82,7 @@ def __init__(self, shape=[4,4]): # This should not be used in any model-free learning algorithm self.P = P - super(GridworldEnv, self).__init__(nS, n_actions, P, isd) + super(GridworldEnv, self).__init__(nS, action_dim, P, isd) def _render(self, mode='human', close=False): """ Renders the current gridworld layout diff --git a/codes/envs/stochastic_mdp.py b/codes/envs/stochastic_mdp.py new file mode 100644 index 00000000..5770fa5d --- /dev/null +++ b/codes/envs/stochastic_mdp.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-24 22:12:19 +LastEditor: John +LastEditTime: 2021-03-26 17:12:43 +Discription: +Environment: +''' +import numpy as np +import random + + +class StochasticMDP: + def __init__(self): + self.end = False + self.curr_state = 2 + self.action_dim = 2 + self.state_dim = 6 + self.p_right = 0.5 + + def reset(self): + self.end = False + self.curr_state = 2 + state = np.zeros(self.state_dim) + state[self.curr_state - 1] = 1. + return state + + def step(self, action): + if self.curr_state != 1: + if action == 1: + if random.random() < self.p_right and self.curr_state < self.state_dim: + self.curr_state += 1 + else: + self.curr_state -= 1 + + if action == 0: + self.curr_state -= 1 + if self.curr_state == self.state_dim: + self.end = True + + state = np.zeros(self.state_dim) + state[self.curr_state - 1] = 1. 
+ + if self.curr_state == 1: + if self.end: + return state, 1.00, True, {} + else: + return state, 1.00/100.00, True, {} + else: + return state, 0.0, False, {} diff --git a/codes/envs/windy_gridworld.py b/codes/envs/windy_gridworld.py index 2a9d4a47..ac9c66ad 100644 --- a/codes/envs/windy_gridworld.py +++ b/codes/envs/windy_gridworld.py @@ -30,7 +30,7 @@ def __init__(self): self.shape = (7, 10) nS = np.prod(self.shape) - n_actions = 4 + action_dim = 4 # Wind strength winds = np.zeros(self.shape) @@ -41,7 +41,7 @@ def __init__(self): P = {} for s in range(nS): position = np.unravel_index(s, self.shape) - P[s] = { a : [] for a in range(n_actions) } + P[s] = { a : [] for a in range(action_dim) } P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds) P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds) P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds) @@ -51,7 +51,7 @@ def __init__(self): isd = np.zeros(nS) isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 - super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd) + super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd) def render(self, mode='human', close=False): self._render(mode, close) diff --git a/docs/README.md b/docs/README.md index b4a4ce9c..d62260a5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -30,23 +30,7 @@ | [第十三章 AlphaStar 论文解读](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | | ## 算法实战 -| 算法名称 | 相关论文材料 | 环境 | 备注 | -| :--------------------------------------: | :---------------------------------------------------------: | ------------------------------------- | :--------------------------------: | -| [On-Policy First-Visit MC](../codes/MonteCarlo) | | [Racetrack](../codes/envs/racetrack_env.md) | | -| [Q-Learning](../codes/QLearning) | | [CliffWalking-v0](../codes/envs/gym_info.md) | | -| [Sarsa](../codes/Sarsa) | | [Racetrack](../codes/envs/racetrack_env.md) | | -| [DQN](../codes/DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | | -| DQN-cnn | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | 与DQN相比使用了CNN而不是全链接网络 | -| [DoubleDQN](../codes/DoubleDQN) | | [CartPole-v0](../codes/envs/gym_info.md) | 效果不好,待改进 | -| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | | -| [PolicyGradient](../codes/PolicyGradient) | | [CartPole-v0](../codes/envs/gym_info.md) | | -| A2C | | [CartPole-v0](../codes/envs/gym_info.md) | | -| A3C | | | | -| SAC | | | | -| [PPO](../codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](../codes/envs/gym_info.md) | | -| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](../codes/envs/gym_info.md) | | -| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | | -| GAIL | | | | +[点击](../codes)或者跳转```codes```文件夹下进入算法实战 ## 贡献者
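
The Double-DQN README above computes the decoupled target inline inside `update()`. The snippet below is a minimal, self-contained sketch of the same idea (the online/policy network selects the greedy next action and the target network evaluates it); the function name `double_dqn_target` and its exact signature are illustrative and not part of the repo:

```python
import torch
import torch.nn as nn

def double_dqn_target(policy_net: nn.Module, target_net: nn.Module,
                      reward_batch: torch.Tensor, next_state_batch: torch.Tensor,
                      done_batch: torch.Tensor, gamma: float = 0.99) -> torch.Tensor:
    """Double DQN target: the online (policy) network picks the greedy next
    action, the target network evaluates it, which curbs over-estimation."""
    with torch.no_grad():
        # action selection with the online network: a* = argmax_a Q(s', a)
        next_actions = policy_net(next_state_batch).max(1)[1].unsqueeze(1)
        # action evaluation with the target network: Q'(s', a*)
        next_q = target_net(next_state_batch).gather(1, next_actions).squeeze(1)
    # terminal transitions (done = 1) contribute only the immediate reward
    return reward_batch + gamma * next_q * (1 - done_batch)
```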
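
The DDPG agent in `codes/DDPG/agent.py` hard-copies the critic and actor weights into `target_critic` and `target_actor` at construction time. DDPG conventionally keeps the targets trailing the online networks afterwards with a soft (Polyak) update inside `update()`; that part of the file is not in this diff, so the helper below is only a sketch, with `soft_update` and `tau` as assumed names:

```python
import torch.nn as nn

def soft_update(target_net: nn.Module, source_net: nn.Module, tau: float = 1e-2) -> None:
    """Polyak averaging: target <- tau * source + (1 - tau) * target.
    With tau = 1.0 this reduces to the hard copy done in DDPG.__init__."""
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
```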
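
Several agents above import `ReplayBuffer` from `common/memory.py`, calling `memory.push(state, action, reward, next_state, done)` to store a transition and `memory.sample(batch_size)` to draw five batches, but that file itself is not touched by this diff. A minimal buffer consistent with that interface might look like the following sketch (the actual implementation in `common/memory.py` may differ):

```python
import random

class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done)
    transitions, matching the push/sample calls made by the agents above."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)  # grow until capacity is reached
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity  # then overwrite the oldest

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # transpose the list of transitions into five batch tuples
        state, action, reward, next_state, done = zip(*batch)
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
```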
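
The DQN, DoubleDQN, HierarchicalDQN and QLearning agents above all anneal epsilon with the same exponential schedule, written in code as `self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)`. With $t$ the running count of sampled actions, this is

$$
\varepsilon_t = \varepsilon_{\text{end}} + \left(\varepsilon_{\text{start}} - \varepsilon_{\text{end}}\right) e^{-t/\lambda},
\qquad \lambda = \texttt{epsilon\_decay},
$$

so exploration starts near `epsilon_start` and decays towards `epsilon_end` at a rate set by `epsilon_decay`.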