mc_control.py
from state import State
from env import step
from collections import defaultdict
from common import action_value_to_value_function, plot_value_function, epsilon_greedy_policy, save
from progressbar import ProgressBar


def monte_carlo_control():
    # GLIE Monte-Carlo control: Q(s, a) estimates plus visit counts that drive
    # the decaying epsilon schedule and the per-pair step size.
    action_value_function = defaultdict(float)
    n_s = defaultdict(int)
    n_s_a = defaultdict(int)
    n_zero = 1E5
    episodes = range(int(1E8))
    pbar = ProgressBar(maxval=len(episodes)).start()
    for episode in episodes:
        state = State()
        while not state.terminal:
            player = state.player
            dealer = state.dealer
            # epsilon = N0 / (N0 + N(s)) decays as the state is visited more often
            epsilon = float(n_zero) / (n_zero + n_s[(dealer, player)])
            action = epsilon_greedy_policy(action_value_function, state, epsilon)
            n_s[(dealer, player)] += 1
            n_s_a[(dealer, player, action)] += 1
            reward = step(state, action)
            # update the action value function towards the observed reward,
            # with step size alpha = 1 / N(s, a)
            alpha = 1.0 / n_s_a[(dealer, player, action)]
            current_estimate = action_value_function[(dealer, player, action)]
            action_value_function[(dealer, player, action)] += alpha * (reward - current_estimate)
        pbar.update(episode)
    pbar.finish()
    value_function = action_value_to_value_function(action_value_function)
    plot_value_function(value_function, "Optimal Value Function: Question 2")
    return action_value_function


if __name__ == '__main__':
    mc_action_value_function = monte_carlo_control()
    save(mc_action_value_function, "mc_result.dat")
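
Note: epsilon_greedy_policy is imported from common and its implementation is not part of this file. The sketch below is only an assumption of how such a helper might look, inferred from the call epsilon_greedy_policy(action_value_function, state, epsilon) and the (dealer, player, action) keys used above; the action labels 'hit' and 'stick' are hypothetical, not the repo's actual code.

from random import random, choice

ACTIONS = ('hit', 'stick')  # assumed action labels; the real ones live in the repo's env/common modules

def epsilon_greedy_policy(action_value_function, state, epsilon):
    # Sketch only: with probability epsilon explore a random action,
    # otherwise exploit the action with the highest current Q(s, a) estimate.
    if random() < epsilon:
        return choice(ACTIONS)
    return max(ACTIONS, key=lambda a: action_value_function[(state.dealer, state.player, a)])

Because action_value_function is a defaultdict(float), unseen state-action pairs evaluate to 0.0, so the greedy step is well defined even before a pair has been visited.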