
Commit

update
johnjim0816 committed Sep 26, 2021
1 parent 1e60b68 commit fb2affb
Showing 18 changed files with 191 additions and 24 deletions.
167 changes: 167 additions & 0 deletions codes/GAE/task0_train.py
@@ -0,0 +1,167 @@
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
import seaborn as sns
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
parent_path = os.path.dirname(curr_path)  # parent directory
sys.path.append(parent_path)  # add the parent directory to sys.path

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

from common.multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"

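# make_env returns a zero-argument constructor (a "thunk") so that SubprocVecEnv
# can build a fresh environment inside each worker subprocess.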
def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

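# The actor head outputs the mean of a diagonal Gaussian policy; log_std is a learned,
# state-independent parameter, so the scale of the exploration noise does not depend on the observation.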
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)

        self.apply(init_weights)

    def forward(self, x):
        value = self.critic(x)
        mu = self.actor(x)
        std = self.log_std.exp().expand_as(mu)
        dist = Normal(mu, std)
        return dist, value


def plot(frame_idx, rewards):
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()

def test_env(vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward

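# Generalized Advantage Estimation (Schulman et al., 2016):
#   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
#   A_t     = delta_t + gamma * tau * mask_t * A_{t+1}
# where mask_t = 0 at episode ends; the function returns A_t + V(s_t),
# i.e. the lambda-return used as the critic's regression target.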
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

num_inputs = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

#Hyper params:
hidden_size = 256
lr = 3e-2
num_steps = 20

model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

max_frames = 100000
frame_idx = 0
test_rewards = []

state = envs.reset()

while frame_idx < max_frames:

    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        state = next_state
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            print(test_rewards[-1])
            # plot(frame_idx, test_rewards)

    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)

    log_probs = torch.cat(log_probs)
    returns = torch.cat(returns).detach()
    values = torch.cat(values)

    advantage = returns - values

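    # The policy term uses the detached advantage so its gradient does not flow into the critic;
    # the value term regresses V(s) onto the GAE returns; the entropy bonus encourages exploration.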
    actor_loss = -(log_probs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()

    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
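A quick sanity check on compute_gae (an illustrative sketch, not part of the commit): with tau=1.0, GAE reduces to ordinary discounted returns, so a two-step rollout with zero value estimates should reproduce [r0 + gamma*r1, r1]. Plain Python floats work in place of tensors, assuming compute_gae from the file above is in scope:

toy_returns = compute_gae(next_value=0.0,
                          rewards=[1.0, 1.0],
                          masks=[1.0, 1.0],
                          values=[0.0, 0.0],
                          gamma=0.5, tau=1.0)
assert toy_returns == [1.5, 1.0]  # 1.5 = r0 + gamma * r1 = 1.0 + 0.5 * 1.0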
8 changes: 3 additions & 5 deletions codes/PPO/agent.py
@@ -5,7 +5,7 @@
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:17:42
LastEditor: John
LastEditTime: 2021-04-28 10:11:09
LastEditTime: 2021-09-26 22:02:00
Description:
Environment:
'''
@@ -41,10 +41,8 @@ def choose_action(self, observation):

    def update(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
                reward_arr, dones_arr, batches = \
                    self.memory.sample()
            values = vals_arr
            state_arr, action_arr, old_prob_arr, vals_arr,reward_arr, dones_arr, batches = self.memory.sample()
            values = vals_arr[:]
            ### compute advantage ###
            advantage = np.zeros(len(reward_arr), dtype=np.float32)
            for t in range(len(reward_arr)-1):
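The hunk above is truncated right after the `for t` line. One common body for that per-timestep, GAE-style advantage loop (an illustrative sketch with assumed attribute names self.gamma and self.gae_lambda, not necessarily this repository's exact code) is:

                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)-1):
                    # accumulate (gamma*lambda)^(k-t) * delta_k, where delta_k is the TD error at step k
                    a_t += discount * (reward_arr[k] + self.gamma * values[k+1]
                                       * (1 - int(dones_arr[k])) - values[k])
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = a_t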
13 changes: 4 additions & 9 deletions codes/PPO/memory.py
@@ -5,7 +5,7 @@
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:30:46
LastEditor: John
LastEditTime: 2021-03-23 15:30:55
LastEditTime: 2021-09-26 22:00:07
Description:
Environment:
'''
@@ -24,14 +24,9 @@ def sample(self):
        indices = np.arange(len(self.states), dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_step]
        return np.array(self.states),\
            np.array(self.actions),\
            np.array(self.probs),\
            np.array(self.vals),\
            np.array(self.rewards),\
            np.array(self.dones),\
            batches

        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
            np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches

    def push(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action)
7 changes: 4 additions & 3 deletions codes/PPO/task0_train.py
@@ -5,7 +5,7 @@
Email: johnjim0816@gmail.com
Date: 2021-03-22 16:18:10
LastEditor: John
LastEditTime: 2021-05-06 00:43:36
LastEditTime: 2021-09-26 22:05:00
Description:
Environment:
'''
@@ -17,6 +17,7 @@
import gym
import torch
import datetime
import tqdm
from PPO.agent import PPO
from common.plot import plot_rewards
from common.utils import save_results,make_dir
@@ -51,7 +52,7 @@ def env_agent_config(cfg,seed=1):
    return env,agent

def train(cfg,env,agent):
    print('Start to train !')
    print('Start training!')
    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
    rewards= []
    ma_rewards = [] # moving average rewards
@@ -75,7 +76,7 @@ def train(cfg,env,agent):
                0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
        print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
        print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}")
    print('Complete training!')
    return rewards,ma_rewards

The remaining 12 changed files are binary or non-displayable output files (contents not shown in the diff view).
18 changes: 12 additions & 6 deletions codes/QLearning/task0_train.py
Expand Up @@ -5,7 +5,7 @@
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2021-09-20 00:32:59
LastEditTime: 2021-09-23 12:22:58
Description:
Environment:
'''
@@ -34,7 +34,7 @@ def __init__(self):
        self.train_eps = 400  # number of training episodes
        self.eval_eps = 30  # number of evaluation episodes
        self.gamma = 0.9  # discount factor for rewards
        self.epsilon_start = 0.99  # initial epsilon for the epsilon-greedy policy
        self.epsilon_start = 0.95  # initial epsilon for the epsilon-greedy policy
        self.epsilon_end = 0.01  # final epsilon for the epsilon-greedy policy
        self.epsilon_decay = 300  # decay rate of epsilon in the epsilon-greedy policy
        self.lr = 0.1  # learning rate
@@ -53,14 +53,15 @@ def env_agent_config(cfg,seed=1):
def train(cfg,env,agent):
    print('Start training!')
    print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards = []
    ma_rewards = []  # moving average rewards
    rewards = []  # record rewards
    ma_rewards = []  # record moving average rewards
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # record the reward of each episode
        state = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.choose_action(state)  # choose an action according to the algorithm
            next_state, reward, done, _ = env.step(action)  # take one step in the environment
            print(reward)
            agent.update(state, action, reward, next_state, done)  # Q-learning update
            state = next_state  # update the state
            ep_reward += reward
@@ -78,6 +79,8 @@ def train(cfg,env,agent):
def eval(cfg,env,agent):
    print('Start evaluation!')
    print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    for item in agent.Q_table.items():
        print(item)
    rewards = []  # record rewards of all episodes
    ma_rewards = []  # moving average rewards
    for i_ep in range(cfg.eval_eps):
@@ -86,7 +89,7 @@ def eval(cfg,env,agent):
        while True:
            action = agent.predict(state)  # choose an action according to the algorithm
            next_state, reward, done, _ = env.step(action)  # take one step in the environment
            state = next_state  # store the previous observation
            state = next_state  # update the state
            ep_reward += reward
            if done:
                break
@@ -103,17 +106,20 @@
cfg = QlearningConfig()

# train
env,agent = env_agent_config(cfg,seed=1)
env,agent = env_agent_config(cfg,seed=0)
rewards,ma_rewards = train(cfg,env,agent)
make_dir(cfg.result_path,cfg.model_path)  # create directories
agent.save(path=cfg.model_path)  # save the model
for item in agent.Q_table.items():
    print(item)
save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)  # save results
plot_rewards_cn(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)

# # eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)  # load the model
rewards,ma_rewards = eval(cfg,env,agent)

save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
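The QlearningConfig above sets epsilon_start, epsilon_end and epsilon_decay, but the schedule that consumes them lives in the agent and is outside this diff. A common exponential schedule consistent with those three parameters (an illustrative sketch, not necessarily this repository's exact code) is:

import math

def epsilon_by_step(step, eps_start=0.95, eps_end=0.01, eps_decay=300):
    # decays from eps_start toward eps_end with time constant eps_decay (in steps)
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)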

2 changes: 1 addition & 1 deletion codes/common/plot.py
@@ -5,7 +5,7 @@
Email: johnjim0816@gmail.com
Date: 2020-10-07 20:57:11
LastEditor: John
LastEditTime: 2021-09-19 23:00:36
LastEditTime: 2021-09-23 12:23:01
Description:
Environment:
'''
