 import numpy as np
 from tqdm import tqdm
 from collections import deque
+from pathlib import Path
+from torch.utils.tensorboard import SummaryWriter
+
+from policy.utils import clip_action
 from policy import agent as Agent


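+# Note: clip_action (imported above from policy.utils) is assumed here to simply
+# bound a noisy continuous action, e.g. np.clip(action, -max_action, max_action).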
 parser = argparse.ArgumentParser(description='Lunar Lander Agents')
-parser.add_argument('--agent', type=str, default='Actor Critic', help='Agent style')
+# training hyperparams
+parser.add_argument('--agent', type=str, default='DDPG', help='Agent style')
 parser.add_argument('--n_episodes', type=int, default=3000, help='Number of episodes you wish to run for')
+parser.add_argument('--batch_size', type=int, default=64, help='Minibatch size')
 parser.add_argument('--hidden_dim', type=int, default=2048, help='Hidden dimension of FC layers')
-parser.add_argument('--lr', '--learning_rate', type=float, default=1e-4, help='Learning rate for Adam optimizer')
+parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate for non-DDPG agents')
+parser.add_argument('--hidden_dims', type=int, nargs='+', default=[400, 300], help='Hidden dimensions of FC layers')
+parser.add_argument('--critic_lr', type=float, default=1e-3, help='Learning rate for Critic')
+parser.add_argument('--critic_wd', type=float, default=1e-2, help='Weight decay for Critic')
+parser.add_argument('--actor_lr', type=float, default=1e-4, help='Learning rate for Actor')
+parser.add_argument('--actor_wd', type=float, default=0., help='Weight decay for Actor')
 parser.add_argument('--gamma', type=float, default=0.99, help='Reward discount factor')
+parser.add_argument('--final_init', type=float, default=3e-3, help='Range for output layer weight initialization')
+parser.add_argument('--tau', type=float, default=0.001, help='Weight of the soft target network update')
+parser.add_argument('--maxsize', type=int, default=1e6, help='Size of the replay buffer')
+parser.add_argument('--sigma', type=float, default=0.2, help='Sigma for OU noise')
+parser.add_argument('--theta', type=float, default=0.15, help='Theta for OU noise')
+parser.add_argument('--dt', type=float, default=1e-2, help='dt for OU noise')
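+# Note: tau controls DDPG's soft target-network updates
+# (target_param <- tau * param + (1 - tau) * target_param), while sigma/theta/dt
+# parameterize Ornstein-Uhlenbeck exploration noise, typically
+#   x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1);
+# the exact forms depend on the implementation in policy/.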

+# eval params
 parser.add_argument('--render', action="store_true", default=False, help='Render environment while training')
 parser.add_argument('--window_length', type=int, default=100, help='Length of the score-tracking window')
+
+# checkpoint + logs
+parser.add_argument('--checkpoint', type=str, default='policy/lunarlander/checkpoint', help='Checkpoint for model weights')
+parser.add_argument('--logdir', type=str, default='policy/lunarlander/logs', help='Directory to save logs')
 args = parser.parse_args()
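+# example invocation (assuming this script is saved as train.py):
+#   python train.py --agent DDPG --n_episodes 1000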


 def main():
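+    # set up the environment, agent, and TensorBoard writer, then run the training loop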
-    env = gym.make('LunarLander-v2')
+    # DDPG acts on a continuous action space, so use the continuous env variant
+    env_type = 'Continuous' if args.agent in ['DDPG'] else ''
+    env = gym.make(f'LunarLander{env_type}-v2')
     agent_ = getattr(Agent, args.agent.replace(' ', '') + 'Agent')
-    agent = agent_(input_dim=env.observation_space.shape,
-                   action_dim=env.action_space.n,
-                   hidden_dim=args.hidden_dim,
-                   gamma=args.gamma,
-                   lr=args.lr)
+    if args.agent in ['DDPG']:
+        max_action = float(env.action_space.high[0])
+        agent = agent_(state_dim=env.observation_space.shape,
+                       action_dim=env.action_space.shape,
+                       hidden_dims=args.hidden_dims,
+                       max_action=max_action,
+                       gamma=args.gamma,
+                       tau=args.tau,
+                       critic_lr=args.critic_lr,
+                       critic_wd=args.critic_wd,
+                       actor_lr=args.actor_lr,
+                       actor_wd=args.actor_wd,
+                       batch_size=args.batch_size,
+                       final_init=args.final_init,
+                       maxsize=int(args.maxsize),
+                       sigma=args.sigma,
+                       theta=args.theta,
+                       dt=args.dt,
+                       checkpoint=args.checkpoint)
+    else:
+        agent = agent_(state_dim=env.observation_space.shape,
+                       action_dim=env.action_space.n,
+                       hidden_dims=args.hidden_dims,
+                       gamma=args.gamma,
+                       lr=args.lr)
+
+    Path(args.logdir).mkdir(parents=True, exist_ok=True)
+    Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
+
+    writer = SummaryWriter(args.logdir)
+
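+    # For reference: in a standard DDPG agent, each agent.update() call samples a
+    # minibatch from the replay buffer, trains the critic toward the TD target
+    # r + gamma * (1 - done) * Q_target(s', actor_target(s')), trains the actor to
+    # maximize Q(s, actor(s)), and then soft-updates both target networks with tau.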
     pbar = tqdm(range(args.n_episodes))
     score_history = deque(maxlen=args.window_length)
+    best_score = env.reward_range[0]
     for e in pbar:
         done, score, observation = False, 0, env.reset()
+
+        # reset the DDPG OU exploration noise and track actor/critic losses per episode
+        if args.agent in ['DDPG']:
+            agent.noise.reset()
+            actor_losses, critic_losses = [], []
         while not done:
             if args.render:
                 env.render()
+
             action = agent.choose_action(observation)
+            # clip the noisy action so it stays within the env's action bounds
+            if args.agent in ['DDPG']:
+                action = clip_action(action, max_action)
             next_observation, reward, done, _ = env.step(action)
+            score += reward
+
+            # update for TD methods, record rewards for MC methods
             if args.agent == 'Actor Critic':
                 agent.update(reward, next_observation, done)
+            elif args.agent in ['DDPG']:
+                agent.store_transition(observation, action, reward, next_observation, done)
+                # only update once the replay buffer holds at least one full minibatch
+                if agent.memory.idx >= args.batch_size:
+                    actor_loss, critic_loss = agent.update()
+                    actor_losses.append(actor_loss)
+                    critic_losses.append(critic_loss)
+                    pbar.set_postfix({'Reward': reward, 'Actor Loss': actor_loss, 'Critic Loss': critic_loss})
             else:
                 agent.store_reward(reward)
             observation = next_observation
-            score += reward
+
+        score_history.append(score)
+
+        # update MC methods with the full trajectory
         if args.agent == 'Policy Gradient':
             agent.update()
-        score_history.append(score)
+
+        # logging & saving
+        elif args.agent in ['DDPG']:
+            writer.add_scalars(
+                'Scores',
+                {'Episodic': score, 'Windowed Average': np.mean(score_history)},
+                global_step=e)
+            if actor_losses:
+                writer.add_scalars(
+                    'Losses',
+                    {'Actor': np.mean(actor_losses), 'Critic': np.mean(critic_losses)},
+                    global_step=e)
+                actor_losses, critic_losses = [], []
+
+            if score > best_score:
+                best_score = score
+                agent.save_models()
         tqdm.write(
             f'Episode: {e + 1}/{args.n_episodes}, Score: {score}, Average Score: {np.mean(score_history)}')
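The replay buffer behind agent.memory and agent.store_transition is defined elsewhere in policy/ and is not part of this diff; a minimal numpy-backed sketch consistent with how it is used above (a store_transition method plus an idx counter compared against batch_size) might look like:

import numpy as np


class ReplayBuffer:
    """Fixed-size buffer of (s, a, r, s', done) transitions; oldest entries are overwritten."""

    def __init__(self, maxsize, state_dim, action_dim):
        self.maxsize = maxsize
        self.idx = 0  # total number of transitions stored so far
        self.states = np.zeros((maxsize, *state_dim), dtype=np.float32)
        self.actions = np.zeros((maxsize, *action_dim), dtype=np.float32)
        self.rewards = np.zeros(maxsize, dtype=np.float32)
        self.next_states = np.zeros((maxsize, *state_dim), dtype=np.float32)
        self.dones = np.zeros(maxsize, dtype=np.float32)

    def store_transition(self, state, action, reward, next_state, done):
        i = self.idx % self.maxsize  # wrap around once the buffer is full
        self.states[i], self.actions[i] = state, action
        self.rewards[i], self.dones[i] = reward, done
        self.next_states[i] = next_state
        self.idx += 1

    def sample(self, batch_size):
        # the training loop above guarantees idx >= batch_size before updating
        upper = min(self.idx, self.maxsize)
        batch = np.random.choice(upper, batch_size, replace=False)
        return (self.states[batch], self.actions[batch], self.rewards[batch],
                self.next_states[batch], self.dones[batch])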