import numpy as np
import matplotlib.pyplot as plt

class Bandit:
    def __init__(self, win_rate):
        self.win_rate = win_rate
        # n starts at 1 so the UCB bonus below never divides by zero; the
        # price is that the initial sample_mean of 0 counts as one observation
        self.n = 1
        self.sample_mean = 0

    def pull(self):
        # Bernoulli reward: 1 with probability win_rate, 0 otherwise
        return np.random.random() < self.win_rate

    def update(self, reward):
        self.n += 1
        # incremental mean update: equivalent to averaging all rewards so far
        learning_rate = 1 / self.n
        self.sample_mean = self.sample_mean + learning_rate * (reward - self.sample_mean)

def get_ucb(mean, N, nj):
    return mean + np.sqrt((2 * np.log(N)) / nj)
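
# The bonus term sqrt(2 ln N / nj) is the UCB1 confidence radius (Auer et al.,
# 2002), which follows from Hoeffding's inequality: with high probability the
# true mean of arm j lies within that radius of its sample mean. Arms pulled
# rarely (small nj) get a large bonus, so taking the argmax still explores them.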

if __name__ == '__main__':
    n_iter = 100000
    win_rate = [0.25, 0.35, 0.45]

    optimal_idx = np.argmax(win_rate)
    bandit_list = [Bandit(rate) for rate in win_rate]
    reward_list = []
    n_optimal = 0
    N = 0
    for t in range(n_iter):
        N += 1
        ucb_list = [get_ucb(b.sample_mean, N, b.n) for b in bandit_list]
        # greedy with respect to the upper confidence bounds; the bonus term
        # in get_ucb supplies the exploration, so no epsilon is needed
        idx = np.argmax(ucb_list)
        if idx == optimal_idx:
            n_optimal += 1
        reward = int(bandit_list[idx].pull())
        reward_list.append(reward)
        bandit_list[idx].update(reward)

    for b in bandit_list:
        print(f"true win rate: {b.win_rate}, mean estimate: {b.sample_mean}")
    print("total reward earned:", np.sum(reward_list))
    print("overall win rate:", np.sum(reward_list) / n_iter)
    print("num times selected optimal bandit:", n_optimal)
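    # (optional addition, not in the original) the fraction of pulls that went
    # to the best arm is easier to read than the raw count above
    print("optimal selection rate:", n_optimal / n_iter)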

    cumulative_rewards = np.cumsum(reward_list)
    win_rates = cumulative_rewards / (np.arange(n_iter) + 1)
    plt.plot(win_rates)
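    # (optional addition) dashed reference lines at each arm's true win rate,
    # plus a log-scaled x-axis, make the early exploration phase and the
    # convergence toward the best arm visible
    for rate in win_rate:
        plt.axhline(y=rate, linestyle='--')
    plt.xscale('log')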
    plt.show()