collector.py
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

import bandit


class Collector():
    def cuda_if(self, t_obj):
        """Moves the tensor to the GPU if CUDA is available."""
        if torch.cuda.is_available():
            t_obj = t_obj.cuda()
        return t_obj

    def __init__(self, net, n_envs, n_bandits, bandit_prob, bootstrap=True):
        self.net = net # PyTorch Module
        self.pi_space = n_bandits
        self.prob = bandit_prob
        self.n_envs = n_envs
        self.softmax = bandit.Bandit().softmax
        self.bootstrap = bootstrap

    def rollout(self, n_tsteps):
        """
        Collects a rollout of n_tsteps from a fresh vector of bandit envs.
        """
        self.envs = self.new_envs() # Vector of bandit envs
        data = {'actions':[], 'sparse_actions':[], 'rewards':[], 'values':[]}
        self.net.reset_state(len(self.envs))
        self.net.train(mode=False)
        self.net.req_grads(False)
        net_input = Variable(self.cuda_if(torch.zeros(len(self.envs), self.pi_space+1)))
        net_inputs = self.cuda_if(torch.zeros(n_tsteps, len(self.envs), self.pi_space+1))
        for i in range(0, n_tsteps):
            net_inputs[i] = net_input.data
            outputs, vals = self.net.forward(net_input)
            pis = self.softmax(outputs.data.cpu().numpy())
            actions = self.get_actions(pis)
            rewards = self.get_rewards(actions)
            data['sparse_actions'].append(np.argmax(actions, axis=-1))
            data['actions'].append(actions)
            data['rewards'].append(rewards)
            vals = vals.data.squeeze().cpu()
            data['values'].append(vals.numpy())
            net_input = self.get_net_input(actions, rewards)
        if self.bootstrap:
            # Bootstrap the final return estimate with one extra value prediction
            outputs, vals = self.net.forward(net_input)
            vals = vals.data.squeeze().cpu()
            data['values'].append(vals.numpy())
        else:
            data['values'].append(np.zeros(vals.shape))
        for key in data.keys():
            data[key] = np.asarray(data[key], dtype=np.float32)
        data['rewards'] = data['rewards'].squeeze()
        data['net_inputs'] = net_inputs
        return data

    def new_envs(self):
        """
        Makes a new list of bandit environments.
        """
        envs = []
        for i in range(self.n_envs):
            # Randomly choose which arm gets the high payout probability
            rand = np.random.random()
            probs = [self.prob, 1-self.prob] if rand <= 0.5 else [1-self.prob, self.prob]
            envs.append(bandit.Bandit(probs=probs))
        return envs

    def get_actions(self, pis):
        """
        pis - ndarray of action probabilities, shape (batch_size, n_bandits)

        returns:
            actions - ndarray of one-hot selected actions, shape (batch_size, n_bandits)
        """
        if len(pis.shape) < 2: pis = pis[None]
        # Sample each row's categorical distribution by walking the cumulative sums
        cumsums = np.zeros((pis.shape[0],))
        randoms = np.random.random((pis.shape[0],))
        actions = np.zeros(pis.shape)
        for i in range(pis.shape[1]):
            cumsums += pis[:,i]
            actions[:,i] = (randoms < cumsums)
            # Push selected rows' randoms past 1 so no later column is also selected
            randoms += actions[:,i]
        actions = actions.astype(np.float32)
        return actions

    def get_rewards(self, actions):
        """
        actions - ndarray of one-hot actions, shape (batch_size, n_bandits)

        returns:
            rewards - ndarray of collected rewards, shape (batch_size, 1)
        """
        rewards = []
        for action, env in zip(actions, self.envs):
            rewards.append(env.pull_lever(action))
        return np.asarray(rewards, dtype=np.float32)[..., None]

    def get_net_input(self, actions, rewards):
        """
        actions - ndarray of one-hot actions, shape (batch_size, n_bandits)
        rewards - ndarray of collected rewards, shape (batch_size, 1)

        returns:
            Variable torch FloatTensor of shape (batch_size, n_bandits+1)
        """
        # The network conditions on the previous action and the reward it earned
        cats = np.concatenate([actions, rewards], axis=-1)
        return Variable(self.cuda_if(torch.FloatTensor(cats)))
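
# Example usage (a minimal sketch; `Model` is a hypothetical recurrent
# policy/value network exposing forward(x) -> (pi_logits, value),
# reset_state(batch_size), and req_grads(flag), as called above):
#
#   model = Model(n_bandits=2)
#   collector = Collector(model, n_envs=64, n_bandits=2, bandit_prob=0.8)
#   data = collector.rollout(n_tsteps=100)
#   # data['rewards'] has shape (n_tsteps, n_envs); data['values'] holds one
#   # extra bootstrapped entry, giving shape (n_tsteps+1, n_envs).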