# Forked from lazyprogrammer/machine_learning_examples
# File: ucb1_starter.py (81 lines, 60 loc, 2.06 KB)
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
# https://books.google.ca/books?id=_ATpBwAAQBAJ&lpg=PA201&ots=rinZM8jQ6s&dq=hoeffding%20bound%20gives%20probability%20%22greater%20than%201%22&pg=PA201#v=onepage&q&f=false
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import matplotlib.pyplot as plt
# Number of bandit pulls simulated in the main loop of run_experiment();
# also the length of the rewards array recorded there.
NUM_TRIALS = 100000
# NOTE(review): not referenced anywhere in the code visible here — presumably
# an exploration rate left over from an epsilon-greedy variant; confirm before
# removing.
EPS = 0.1
# Win rate of each simulated bandit; one Bandit is created per entry.
BANDIT_PROBABILITIES = [0.2, 0.5, 0.75]
class Bandit:
    """A single slot machine paying out 1 with a fixed win rate.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, p):
        """Create a bandit whose true win rate is ``p``."""
        # p: the win rate
        self.p = p
        # running sample mean of the rewards seen so far
        self.p_estimate = 0.
        # num samples collected so far (kept as a float)
        self.N = 0.

    def pull(self):
        """Draw one reward: truthy with probability ``p``, falsy otherwise."""
        draw = np.random.random()
        return draw < self.p

    def update(self, x):
        """Fold the observed reward ``x`` into the running mean estimate."""
        count = self.N + 1.
        # rebuild the previous sum of rewards from the old mean, then add x
        reward_sum = (count - 1) * self.p_estimate + x
        self.N = count
        self.p_estimate = reward_sum / count
def ucb(mean, n, nj):
    """Return the UCB1 upper confidence bound for one bandit.

    The bound is the sample mean plus an exploration bonus
    ``sqrt(2 * ln(n) / nj)``, which grows slowly with the total number of
    plays and shrinks as this particular bandit is played more often.

    Args:
        mean: current estimate of the bandit's win rate.
        n: total number of plays across all bandits so far (expected >= 1).
        nj: number of times this bandit has been played.

    Returns:
        The upper confidence bound. ``inf`` is returned when ``nj`` is not
        positive, so a bandit that has never been played is always preferred
        and no division by zero occurs.
    """
    if nj <= 0:
        return float('inf')
    return mean + np.sqrt(2 * np.log(n) / nj)


def run_experiment():
    """Simulate UCB1 on the configured bandits, plot and print the results.

    Each bandit is pulled once for initialization (these pulls are counted in
    ``total_plays`` but not recorded in the reward history). Then, for
    ``NUM_TRIALS`` rounds, the bandit with the largest upper confidence bound
    is pulled and its estimate updated.

    Returns:
        Array of length ``NUM_TRIALS`` holding the cumulative average reward
        after each trial.
    """
    bandits = [Bandit(p) for p in BANDIT_PROBABILITIES]
    rewards = np.empty(NUM_TRIALS)
    total_plays = 0

    # initialization: play each bandit once so every play count is nonzero
    for b in bandits:
        x = b.pull()
        total_plays += 1
        b.update(x)

    for i in range(NUM_TRIALS):
        # act greedily with respect to the upper confidence bounds
        j = np.argmax([ucb(b.p_estimate, total_plays, b.N) for b in bandits])
        x = bandits[j].pull()
        total_plays += 1
        bandits[j].update(x)

        # for the plot
        rewards[i] = x
    cumulative_average = np.cumsum(rewards) / (np.arange(NUM_TRIALS) + 1)

    # plot moving average ctr (log-scaled x axis) against the best win rate
    plt.plot(cumulative_average)
    plt.plot(np.ones(NUM_TRIALS)*np.max(BANDIT_PROBABILITIES))
    plt.xscale('log')
    plt.show()

    # plot moving average ctr linear
    plt.plot(cumulative_average)
    plt.plot(np.ones(NUM_TRIALS)*np.max(BANDIT_PROBABILITIES))
    plt.show()

    for b in bandits:
        print(b.p_estimate)

    print("total reward earned:", rewards.sum())
    print("overall win rate:", rewards.sum() / NUM_TRIALS)
    print("num times selected each bandit:", [b.N for b in bandits])

    return cumulative_average
# Run the simulation only when this file is executed as a script.
if __name__ == '__main__':
    run_experiment()