forked from lazyprogrammer/machine_learning_examples
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathiterative_policy_evaluation_probabilistic.py
112 lines (88 loc) · 3.08 KB
/
iterative_policy_evaluation_probabilistic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
from grid_world import windy_grid, ACTION_SPACE
SMALL_ENOUGH = 1e-3 # threshold for convergence
def print_values(V, g):
for i in range(g.rows):
print("---------------------------")
for j in range(g.cols):
v = V.get((i,j), 0)
if v >= 0:
print(" %.2f|" % v, end="")
else:
print("%.2f|" % v, end="") # -ve sign takes up an extra space
print("")
def print_policy(P, g):
for i in range(g.rows):
print("---------------------------")
for j in range(g.cols):
a = P.get((i,j), ' ')
print(" %s |" % a, end="")
print("")
if __name__ == '__main__':
### define transition probabilities and grid ###
# the key is (s, a, s'), the value is the probability
# that is, transition_probs[(s, a, s')] = p(s' | s, a)
# any key NOT present will considered to be impossible (i.e. probability 0)
# we can take this from the grid object and convert it to the format we want
transition_probs = {}
# to reduce the dimensionality of the dictionary, we'll use deterministic
# rewards, r(s, a, s')
# note: you could make it simpler by using r(s') since the reward doesn't
# actually depend on (s, a)
rewards = {}
grid = windy_grid()
for (s, a), v in grid.probs.items():
for s2, p in v.items():
transition_probs[(s, a, s2)] = p
rewards[(s, a, s2)] = grid.rewards.get(s2, 0)
### probabilistic policy ###
policy = {
(2, 0): {'U': 0.5, 'R': 0.5},
(1, 0): {'U': 1.0},
(0, 0): {'R': 1.0},
(0, 1): {'R': 1.0},
(0, 2): {'R': 1.0},
(1, 2): {'U': 1.0},
(2, 1): {'R': 1.0},
(2, 2): {'U': 1.0},
(2, 3): {'L': 1.0},
}
print_policy(policy, grid)
# initialize V(s) = 0
V = {}
for s in grid.all_states():
V[s] = 0
gamma = 0.9 # discount factor
# repeat until convergence
it = 0
while True:
biggest_change = 0
for s in grid.all_states():
if not grid.is_terminal(s):
old_v = V[s]
new_v = 0 # we will accumulate the answer
for a in ACTION_SPACE:
for s2 in grid.all_states():
# action probability is deterministic
action_prob = policy[s].get(a, 0)
# reward is a function of (s, a, s'), 0 if not specified
r = rewards.get((s, a, s2), 0)
new_v += action_prob * transition_probs.get((s, a, s2), 0) * (r + gamma * V[s2])
# after done getting the new value, update the value table
V[s] = new_v
biggest_change = max(biggest_change, np.abs(old_v - V[s]))
print("iter:", it, "biggest_change:", biggest_change)
print_values(V, grid)
it += 1
if biggest_change < SMALL_ENOUGH:
break
print("V:", V)
print("\n\n")
# sanity check
# at state (1, 2), value is 0.5 * 0.9 * 1 + 0.5 * (-1) = -0.05