import gym
import numpy as np
import torch as T
from actor_critic import ActorCritic
from icm import ICM
from memory import Memory
from utils import plot_learning_curve


def worker(name, input_shape, n_actions, global_agent, global_icm,
           optimizer, icm_optimizer, env_id, n_threads, icm=False):
    T_MAX = 20  # steps per local rollout before syncing with the global networks

    local_agent = ActorCritic(input_shape, n_actions)

    if icm:
        local_icm = ICM(input_shape, n_actions)
        algo = 'ICM'
    else:
        # no ICM: the intrinsic reward stays at zero (plain A3C)
        intrinsic_reward = T.zeros(1)
        algo = 'A3C'

    memory = Memory()

    env = gym.make(env_id)

    t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0

    while episode < max_eps:
        obs = env.reset()
        hx = T.zeros(1, 256)  # recurrent hidden state, reset at the start of each episode
        score, done, ep_steps = 0, False, 0
        while not done:
            state = T.tensor([obs], dtype=T.float)
            action, value, log_prob, hx = local_agent(state, hx)
            obs_, reward, done, info = env.step(action)
            t_steps += 1
            ep_steps += 1
            score += reward
            reward = 0  # turn off extrinsic rewards
            memory.remember(obs, action, reward, obs_, value, log_prob)
            obs = obs_
            if ep_steps % T_MAX == 0 or done:
                states, actions, rewards, new_states, values, log_probs = \
                        memory.sample_memory()
                if icm:
                    # inverse (L_I) and forward (L_F) model losses plus the
                    # per-step curiosity reward from the forward-model error
                    intrinsic_reward, L_I, L_F = \
                            local_icm.calc_loss(states, new_states, actions)

                loss = local_agent.calc_loss(obs, hx, done, rewards, values,
                                             log_probs, intrinsic_reward)

                optimizer.zero_grad()
                hx = hx.detach()  # truncate backprop through the hidden state at update boundaries
                if icm:
                    icm_optimizer.zero_grad()
                    (L_I + L_F).backward()

                loss.backward()
                T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)

                # push local gradients to the shared global networks, step the
                # shared optimizers, then sync the local copies with the result
                for local_param, global_param in zip(
                                        local_agent.parameters(),
                                        global_agent.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_agent.load_state_dict(global_agent.state_dict())

                if icm:
                    for local_param, global_param in zip(
                                            local_icm.parameters(),
                                            global_icm.parameters()):
                        global_param._grad = local_param.grad
                    icm_optimizer.step()
                    local_icm.load_state_dict(global_icm.state_dict())
                memory.clear_memory()

        # only worker '1' logs scores and saves the learning curve
        if name == '1':
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} '
                  'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format(
                      algo, episode, name, n_threads,
                      t_steps/1e6, score,
                      T.sum(intrinsic_reward).item(),
                      avg_score))
        episode += 1
    if name == '1':
        x = list(range(episode))
        fname = algo + '_CartPole_no_rewards.png'
        plot_learning_curve(x, scores, fname)
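
For context, here is a minimal sketch of how this worker function might be launched from a main script. It assumes torch.multiprocessing, CartPole-v0 (observation size 4, two actions), a run() helper, and plain Adam on shared-memory networks; all of these are illustrative assumptions, not part of this commit (a shared-state "SharedAdam"-style optimizer is commonly used instead).

# Hypothetical launcher sketch (illustration only): the global networks live in
# shared memory and the same optimizers are handed to every worker process.
import torch.multiprocessing as mp


def run(env_id='CartPole-v0', n_threads=4, use_icm=True):
    input_shape = [4]   # CartPole observation size (assumed)
    n_actions = 2       # CartPole action count (assumed)
    global_agent = ActorCritic(input_shape, n_actions)
    global_agent.share_memory()  # make parameters visible to all workers
    optimizer = T.optim.Adam(global_agent.parameters(), lr=1e-4)
    global_icm, icm_optimizer = None, None
    if use_icm:
        global_icm = ICM(input_shape, n_actions)
        global_icm.share_memory()
        icm_optimizer = T.optim.Adam(global_icm.parameters(), lr=1e-4)
    processes = [mp.Process(target=worker,
                            args=(str(i), input_shape, n_actions, global_agent,
                                  global_icm, optimizer, icm_optimizer,
                                  env_id, n_threads, use_icm))
                 for i in range(1, n_threads + 1)]  # worker '1' does the logging
    for p in processes:
        p.start()
    for p in processes:
        p.join()


if __name__ == '__main__':
    run()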