# td0_prediction.py (forked from lazyprogrammer/machine_learning_examples)
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
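#
# TD(0) prediction: estimate the state-value function V(s) of a fixed policy
# by sampling episodes and, after every step, nudging V(s) toward the
# bootstrapped target r + GAMMA * V(s_next).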
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3  # convergence threshold (not used in this script)
GAMMA = 0.9          # discount factor
ALPHA = 0.1          # TD learning rate (step size)
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')


def epsilon_greedy(policy, s, eps=0.1):
  # we'll use epsilon-soft to ensure all states are visited
  # what happens if you don't do this? i.e. eps=0
  p = np.random.random()
  if p < (1 - eps):
    return policy[s]
  else:
    return np.random.choice(ALL_POSSIBLE_ACTIONS)


if __name__ == '__main__':
  # use the standard grid again (0 for every step) so that we can compare
  # to iterative policy evaluation
  grid = standard_grid()

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # state -> action
  policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
  }

  # initialize V(s) to zero for every state
  V = {}
  states = grid.all_states()
  for s in states:
    V[s] = 0

  # store max change in V(s) per episode
  deltas = []

  # run TD(0) for a fixed number of episodes
  n_episodes = 10000
  for it in range(n_episodes):
    # begin a new episode
    s = grid.reset()

    delta = 0
    while not grid.game_over():
      a = epsilon_greedy(policy, s)

      r = grid.move(a)
      s_next = grid.current_state()
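
      # TD(0) bootstraps: the target r + GAMMA * V[s_next] uses the current
      # estimate of the next state's value in place of a full sampled return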

      # update V(s)
      v_old = V[s]
      V[s] = V[s] + ALPHA*(r + GAMMA*V[s_next] - V[s])
      delta = max(delta, np.abs(V[s] - v_old))

      # next state becomes current state
      s = s_next

    # store delta
    deltas.append(delta)
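
  # plot the max per-episode change in V(s) to visualize how the estimates settle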
  plt.plot(deltas)
  plt.show()
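
  # compare the learned V(s) below with the exact values computed by
  # iterative_policy_evaluation for the same fixed policy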
print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)