import numpy as np
import matplotlib.pyplot as plt


class Bandit:
    def __init__(self, win_rate):
        self.win_rate = win_rate
        self.n = 1  # the optimistic prior counts as one pseudo-observation
        self.sample_mean = 5  # optimistic initial value, well above any true win rate
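
    # Why this drives exploration: every arm starts with an estimated mean of 5,
    # far above any achievable win rate, so a purely greedy agent keeps revisiting
    # under-sampled arms until their estimates decay toward the true means.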

    def pull(self):
        # Bernoulli reward: True with probability win_rate, else False
        return np.random.random() < self.win_rate

    def update(self, reward):
        # incremental sample mean: Q_n = Q_{n-1} + (1/n) * (x_n - Q_{n-1})
        self.n += 1
        learning_rate = 1 / self.n
        self.sample_mean = self.sample_mean + learning_rate * (reward - self.sample_mean)
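
    # A worked step of the update (a sketch): starting from the optimistic prior
    # Q_1 = 5 with n = 1, the first observed reward x gives
    #     Q_2 = Q_1 + (1/2) * (x - Q_1),
    # so a single pull already moves the estimate halfway toward the data;
    # the optimism washes out at rate 1/n.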


if __name__ == '__main__':
    n_iter = 10000
    win_rates = [0.25, 0.35, 0.45]

    optimal_idx = np.argmax(win_rates)
    bandit_list = [Bandit(rate) for rate in win_rates]
    reward_list = []
    n_optimal = 0
    for t in range(n_iter):
        # always exploit (pure greedy): optimistic initialization supplies the
        # exploration; np.argmax breaks ties by returning the first maximal index
        idx = np.argmax([b.sample_mean for b in bandit_list])
        if idx == optimal_idx:
            n_optimal += 1
        reward = int(bandit_list[idx].pull())  # cast the boolean outcome to 0/1
        reward_list.append(reward)
        bandit_list[idx].update(reward)
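
    # A possible extension (a sketch, not in the original): cumulative regret,
    # the expected reward from always playing the best arm minus the reward earned:
    #     regret = np.max(win_rates) * n_iter - np.sum(reward_list)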

    for b in bandit_list:
        print(f"mean estimate: {b.sample_mean}")
    print("total reward earned:", np.sum(reward_list))
    print("overall win rate:", np.sum(reward_list) / n_iter)
    print("num times selected optimal bandit:", n_optimal)

    cumulative_rewards = np.cumsum(reward_list)
    running_win_rate = cumulative_rewards / (np.arange(n_iter) + 1)
    plt.plot(running_win_rate)
    plt.show()
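
    # For reference, one might overlay the best achievable win rate before plt.show()
    # (a sketch; plt.axhline is standard matplotlib):
    #     plt.axhline(y=np.max(win_rates), color='r', linestyle='--')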