
Commit 79fcf3e

Create bandit_OIV.py
1 parent 3ac5e05 commit 79fcf3e

File tree

bandit_OIV.py

1 file changed: +45 -0 lines changed

Lines changed: 45 additions & 0 deletions

bandit_OIV.py

import numpy as np
import matplotlib.pyplot as plt


class Bandit:
    def __init__(self, win_rate):
        self.win_rate = win_rate
        self.n = 1  # the optimistic prior counts as one pseudo-observation
        self.sample_mean = 5  # optimistic initial value, far above any true win rate

    def pull(self):
        # Bernoulli reward: True with probability win_rate
        return np.random.random() < self.win_rate

    def update(self, reward):
        # incremental sample mean: Q_n = Q_{n-1} + (1/n) * (reward - Q_{n-1})
        self.n += 1
        learning_rate = 1 / self.n
        self.sample_mean = self.sample_mean + learning_rate * (reward - self.sample_mean)


if __name__ == '__main__':
    n_iter = 10000
    win_rate = [0.25, 0.35, 0.45]

    optimal_idx = np.argmax(win_rate)
    bandit_list = [Bandit(rate) for rate in win_rate]
    reward_list = []
    n_optimal = 0
    for t in range(n_iter):
        # always exploit: the optimistic prior makes under-sampled arms look
        # best, so pure greedy selection still explores early on
        idx = np.argmax([b.sample_mean for b in bandit_list])
        if idx == optimal_idx:
            n_optimal += 1
        reward = int(bandit_list[idx].pull())
        reward_list.append(reward)
        bandit_list[idx].update(reward)

    for b in bandit_list:
        print(f"mean estimate is {b.sample_mean}")
    print("total reward earned:", np.sum(reward_list))
    print("overall win rate:", np.sum(reward_list) / n_iter)
    print("num times selected optimal bandit:", n_optimal)

    cumulative_rewards = np.cumsum(reward_list)
    win_rates = cumulative_rewards / (np.arange(n_iter) + 1)
    plt.plot(win_rates)
    plt.show()
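
For intuition about how the optimistic prior drives exploration, here is a minimal sketch (not part of the commit) that reruns the same greedy loop with several initial estimates; run_greedy is a hypothetical helper written for illustration, using the same win rates as the script above. With a low initial value, greedy selection tends to lock onto whichever arm pays out first; the value of 5 keeps every arm looking attractive until its estimate has decayed below the others.

import numpy as np

def run_greedy(initial_value, win_rates, n_iter=10000, seed=0):
    # hypothetical helper: the same greedy loop as above, parameterized
    # by the initial sample-mean estimate
    rng = np.random.default_rng(seed)
    n = np.ones(len(win_rates))                        # pull counts start at 1, as in the class above
    q = np.full(len(win_rates), float(initial_value))  # sample-mean estimates
    total = 0.0
    for _ in range(n_iter):
        idx = int(np.argmax(q))                        # always exploit
        reward = float(rng.random() < win_rates[idx])
        n[idx] += 1
        q[idx] += (reward - q[idx]) / n[idx]
        total += reward
    return total / n_iter

for init in [0.0, 1.0, 5.0]:
    print(init, run_greedy(init, [0.25, 0.35, 0.45]))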

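The usual alternative to optimistic initial values is epsilon-greedy, where exploration comes from occasional random pulls; the utils.epsilon schedules this repo ships (standard_epsilon, decaying_epsilon, exponential_decay_epsilon) serve that style of agent. For contrast, here is a minimal fixed-epsilon sketch; it is self-contained, run_epsilon_greedy is a hypothetical name, and it does not use the utils.epsilon API.

import numpy as np

def run_epsilon_greedy(epsilon, win_rates, n_iter=10000, seed=0):
    # hypothetical helper for comparison: explore with probability epsilon,
    # otherwise exploit the current best estimate
    rng = np.random.default_rng(seed)
    n = np.zeros(len(win_rates))
    q = np.zeros(len(win_rates))  # realistic (zero) initial values
    total = 0.0
    for _ in range(n_iter):
        if rng.random() < epsilon:
            idx = int(rng.integers(len(win_rates)))  # explore: random arm
        else:
            idx = int(np.argmax(q))                  # exploit
        reward = float(rng.random() < win_rates[idx])
        n[idx] += 1
        q[idx] += (reward - q[idx]) / n[idx]
        total += reward
    return total / n_iter

print(run_epsilon_greedy(0.1, [0.25, 0.35, 0.45]))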