
Commit d59378f

Create bandit_ucb.py
1 parent 79fcf3e commit d59378f

File tree

bandit_ucb.py

1 file changed: 50 additions & 0 deletions
import numpy as np
import matplotlib.pyplot as plt


class Bandit:
    def __init__(self, win_rate):
        self.win_rate = win_rate  # true (hidden) probability of a reward
        self.n = 1                # pull count; starting at 1 keeps get_ucb from dividing by zero,
                                  # at the cost of one phantom sample that slightly biases the mean toward 0
        self.sample_mean = 0      # running estimate of win_rate

    def pull(self):
        # Bernoulli reward: True with probability win_rate
        return np.random.random() < self.win_rate

    def update(self, reward):
        # Incremental mean: mean_n = mean_{n-1} + (1/n) * (reward - mean_{n-1})
        self.n += 1
        learning_rate = 1 / self.n
        self.sample_mean = self.sample_mean + learning_rate * (reward - self.sample_mean)


def get_ucb(mean, N, nj):
    # UCB1 score: sample mean plus an exploration bonus that grows with the
    # total pull count N and shrinks with this arm's own pull count nj
    return mean + np.sqrt((2 * np.log(N)) / nj)


if __name__ == '__main__':
    n_iter = 100000
    win_rate = [0.25, 0.35, 0.45]

    optimal_idx = np.argmax(win_rate)
    bandit_list = [Bandit(rate) for rate in win_rate]
    reward_list = []
    n_optimal = 0
    N = 0
    for t in range(n_iter):
        N += 1
        ucb_list = [get_ucb(b.sample_mean, N, b.n) for b in bandit_list]
        # Greedy over the UCB scores: no epsilon-style random exploration is
        # needed, because the bonus term already drives exploration
        idx = np.argmax(ucb_list)
        if idx == optimal_idx:
            n_optimal += 1
        reward = int(bandit_list[idx].pull())
        reward_list.append(reward)
        bandit_list[idx].update(reward)

    for b in bandit_list:
        print(f"mean estimate is {b.sample_mean}")
    print("total reward earned:", np.sum(reward_list))
    print("overall win rate:", np.sum(reward_list) / n_iter)
    print("num times selected optimal bandit:", n_optimal)

    # Plot the running win rate over time
    cumulative_rewards = np.cumsum(reward_list)
    win_rates = cumulative_rewards / (np.arange(n_iter) + 1)
    plt.plot(win_rates)
    plt.show()
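
For intuition about the get_ucb bonus above: sqrt(2 ln N / nj) shrinks as an arm accumulates pulls, so a rarely tried arm keeps a large optimistic score until it has been sampled enough. A minimal sketch of that decay (the N and nj values below are illustrative, not taken from the run above):

import numpy as np

# Illustrative numbers only: how the UCB1 exploration bonus for one arm
# decays as its pull count nj grows, with total pulls N held at 10,000.
N = 10_000
for nj in (1, 10, 100, 1000):
    bonus = np.sqrt(2 * np.log(N) / nj)
    print(f"nj={nj:>4}  exploration bonus={bonus:.3f}")

With N fixed, quadrupling nj halves the bonus, which is why well-sampled arms end up being chosen on their sample mean alone.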
