
Commit b627774

added lunar lander agent, ac & pg
1 parent ffde0ba commit b627774

File tree

3 files changed: +146, -0 lines


policy/agent.py

Lines changed: 74 additions & 0 deletions
@@ -1,4 +1,6 @@
import numpy as np
import torch
from policy.networks import ActorCritic


class BlackJackAgent:
@@ -122,3 +124,75 @@ def update(self, state, action, reward, state_):

    def decrease_eps(self):
        self.epsilon = max(0.01, self.epsilon - 1e-5)


class PolicyGradientAgent:
    def __init__(self, input_dim, action_dim, hidden_dim, gamma, lr):
        self.gamma = gamma
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # move the network to the same device the states are sent to
        self.policy = ActorCritic(*input_dim, action_dim, hidden_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr)
        self.reward_history, self.action_logprob_history = [], []

    def choose_action(self, state):
        state = torch.from_numpy(state).to(self.device)
        # the shared network returns (value, logits); only the policy head is used here
        _, action_logits = self.policy(state)
        action_proba = torch.softmax(action_logits, dim=-1)
        action_dist = torch.distributions.Categorical(action_proba)
        action = action_dist.sample()
        if self.policy.training:
            log_probas = action_dist.log_prob(action)
            self.action_logprob_history.append(log_probas)
        return action.item()

    def store_reward(self, reward):
        self.reward_history.append(reward)

    def update(self):
        # calculate MC returns & REINFORCE loss
        T = len(self.reward_history)
        discounts = torch.logspace(0, T, steps=T + 1, base=self.gamma, device=self.device)[:T]
        returns = torch.stack([torch.tensor(
            self.reward_history[t:], dtype=torch.float, device=self.device) @ discounts[t:] for t in range(T)])
        loss = 0
        for g, log_prob in zip(returns, self.action_logprob_history):
            loss += -g * log_prob

        # sgd + reset history
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.reward_history, self.action_logprob_history = [], []
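
As an aside (not part of the diff), here is a minimal sketch with made-up rewards of what the logspace-based return computation in PolicyGradientAgent.update produces; note that pairing discounts[t:] with self.reward_history[t:] weights each reward tail by gamma^k counted from the start of the episode, i.e. it yields gamma^t times the usual return G_t:

import torch

gamma, rewards = 0.99, [1.0, 0.0, 2.0]   # toy episode, default gamma from the script
T = len(rewards)
discounts = torch.logspace(0, T, steps=T + 1, base=gamma)[:T]    # gamma^0 .. gamma^(T-1)
returns = torch.stack([
    torch.tensor(rewards[t:]) @ discounts[t:] for t in range(T)  # sum_{k>=t} gamma^k * r_k
])
print(discounts)  # tensor([1.0000, 0.9900, 0.9801])
print(returns)    # tensor([2.9602, 1.9602, 1.9602])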


class ActorCriticAgent:
    def __init__(self, input_dim, action_dim, hidden_dim, gamma, lr):
        self.gamma = gamma
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.actor_critic = ActorCritic(*input_dim, action_dim, hidden_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.actor_critic.parameters(), lr)
        self.log_proba, self.value = None, None

    def choose_action(self, state):
        state = torch.from_numpy(state).to(self.device)
        self.value, action_logits = self.actor_critic(state)
        action_proba = torch.softmax(action_logits, dim=-1)
        action_dist = torch.distributions.Categorical(action_proba)
        action = action_dist.sample()
        self.log_proba = action_dist.log_prob(action)
        return action.item()

    def update(self, reward, state_, done):
        # calculate one-step TD target & critic loss
        state_ = torch.from_numpy(state_).unsqueeze(0).to(self.device)
        value_, _ = self.actor_critic(state_)
        # mask the bootstrap term on terminal steps; `done` is a Python bool,
        # so use (1 - int(done)) rather than bitwise negation
        td_target = reward + self.gamma * value_.detach() * (1 - int(done))
        td_error = td_target - self.value
        critic_loss = td_error.pow(2)

        # actor loss: log-probability of the taken action weighted by the TD error
        actor_loss = -td_error.detach() * self.log_proba

        # sgd
        loss = critic_loss + actor_loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
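
As another aside (not part of the diff), a toy illustration with made-up numbers of the one-step TD quantities computed in ActorCriticAgent.update:

import torch

gamma, reward, done = 0.99, 1.0, False
value, value_ = torch.tensor([0.5]), torch.tensor([0.8])   # V(s), V(s') from the value head
td_target = reward + gamma * value_ * (1 - int(done))      # tensor([1.7920])
td_error = td_target - value                               # tensor([1.2920])
critic_loss = td_error.pow(2)                              # tensor([1.6693])
log_proba = torch.tensor(-1.2)                             # log pi(a|s) for the sampled action
actor_loss = -td_error.detach() * log_proba                # positive TD error -> increase log pi(a|s)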

policy/lunarlander/main.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
import gym
import argparse
import torch
import numpy as np
from tqdm import tqdm
from collections import deque
from policy import agent as Agent


parser = argparse.ArgumentParser(description='Lunar Lander Agents')
parser.add_argument('--agent', type=str, default='Actor Critic', help='Agent style')
parser.add_argument('--n_episodes', type=int, default=3000, help='Number of episodes you wish to run for')
parser.add_argument('--hidden_dim', type=int, default=2048, help='Hidden dimension of FC layers')
parser.add_argument('--lr', '--learning_rate', type=float, default=1e-4, help='Learning rate for Adam optimizer')
parser.add_argument('--gamma', type=float, default=0.99, help='Reward discount factor')

parser.add_argument('--render', action="store_true", default=False, help='Render environment while training')
parser.add_argument('--window_length', type=int, default=100, help='Length of window to keep track of scores')
args = parser.parse_args()


def main():
    env = gym.make('LunarLander-v2')
    agent_ = getattr(Agent, args.agent.replace(' ', '') + 'Agent')
    agent = agent_(input_dim=env.observation_space.shape,
                   action_dim=env.action_space.n,
                   hidden_dim=args.hidden_dim,
                   gamma=args.gamma,
                   lr=args.lr)
    pbar = tqdm(range(args.n_episodes))
    score_history = deque(maxlen=args.window_length)
    for e in pbar:
        done, score, observation = False, 0, env.reset()
        while not done:
            if args.render:
                env.render()
            action = agent.choose_action(observation)
            next_observation, reward, done, _ = env.step(action)
            # Actor Critic updates online at every step; Policy Gradient stores
            # rewards and updates once at the end of the episode
            if args.agent == 'Actor Critic':
                agent.update(reward, next_observation, done)
            else:
                agent.store_reward(reward)
            observation = next_observation
            score += reward
        if args.agent == 'Policy Gradient':
            agent.update()
        score_history.append(score)
        tqdm.write(
            f'Episode: {e + 1}/{args.n_episodes}, Score: {score}, Average Score: {np.mean(score_history)}')


if __name__ == '__main__':
    main()
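
A note on the --agent flag (aside, not part of the diff): the script resolves it by stripping spaces and appending 'Agent' before looking the class up on policy.agent, so the two values this commit supports are 'Policy Gradient' and 'Actor Critic'. A minimal check of that mapping:

from policy import agent as Agent

for name in ('Policy Gradient', 'Actor Critic'):
    cls = getattr(Agent, name.replace(' ', '') + 'Agent')
    print(f'{name!r} -> {cls.__name__}')
# 'Policy Gradient' -> PolicyGradientAgent
# 'Actor Critic' -> ActorCriticAgent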

policy/networks.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
1+
import torch
2+
from torch import nn
3+
4+
5+
class ActorCritic(nn.Module):
6+
def __init__(self, input_dim, n_actions, hidden_dim):
7+
super().__init__()
8+
self.encoder = nn.Sequential(
9+
nn.Linear(input_dim, hidden_dim),
10+
nn.ReLU(True),
11+
nn.Linear(hidden_dim, hidden_dim // 2),
12+
nn.ReLU(True)
13+
)
14+
self.v = nn.Linear(hidden_dim // 2, 1)
15+
self.pi = nn.Linear(hidden_dim // 2, n_actions)
16+
17+
def forward(self, state):
18+
features = self.encoder(state)
19+
return self.v(features), self.pi(features)
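
For a quick sanity check of the two heads (aside, not part of the diff), the sketch below runs the network at the dimensions LunarLander-v2 provides (8-dimensional observation, 4 discrete actions) with the script's default hidden size:

import torch
from policy.networks import ActorCritic

net = ActorCritic(input_dim=8, n_actions=4, hidden_dim=2048)
state = torch.randn(8)              # single unbatched observation, as passed by choose_action
value, logits = net(state)
print(value.shape, logits.shape)    # torch.Size([1]) torch.Size([4])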
