Qlearning.py
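
# Tabular Q-learning on the YuanYang grid-world environment.
#
# td_learning() below implements the standard off-policy TD(0) update:
#     Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
# with the target reduced to just r when s' is terminal (Q[terminal] = 0).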
import random
import time

import numpy as np

from yuanyang_env2 import YuanYangEnv
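
# Note: YuanYangEnv is assumed (from how it is used below) to expose
# `states` and `actions` lists, `reset()` returning an integer state,
# `step(a)` returning (next_state, reward, done), plus `render()`,
# `state_to_position(s)`, and the `bird_male_position` / `state`
# attributes used for visualization.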


class Qlearning(object):
    def __init__(self, env):
        self.gamma = 0.9    # discount factor
        self.lr = 0.5       # learning rate (step size)
        self.env = env
        self.epsilon = 1    # exploration rate; 1 means the behavior policy is fully random
        # One Q-value per (state, action) pair, initialized to zero
        self.Q = np.zeros((len(self.env.states), len(self.env.actions)), dtype=np.float32)
    def td_learning(self):
        for i in range(300):    # training episodes
            s = self.env.reset()
            step_n = 0
            while True:
                a = self.e_greedy_policy(s)
                # Map the chosen action back to its column index in the Q table
                # (e_greedy_policy returns an element of env.actions, not an index)
                a_idx = self.env.actions.index(a)
                s_, r, t = self.env.step(a)
                if t:
                    # s_ is a terminal state, so its value is zero: the target is just r
                    td_error = r - self.Q[s, a_idx]
                else:
                    td_error = r + self.gamma * np.max(self.Q[s_]) - self.Q[s, a_idx]
                self.Q[s, a_idx] += self.lr * td_error
                step_n += 1
                if t or step_n > 40:    # cap episode length at 40 steps
                    break
                s = s_

    # During training, actions are chosen with an epsilon-greedy (soft) policy.
    # With epsilon = 1 the behavior is uniformly random, which is still valid
    # because Q-learning is off-policy: it learns the values of the greedy
    # policy regardless of how the data is collected.
    def e_greedy_policy(self, s):
        a_max = np.argmax(self.Q[s])
        if np.random.uniform() < 1 - self.epsilon:
            # Exploit: take the current greedy action
            return self.env.actions[a_max]
        else:
            # Explore: take a uniformly random action
            return random.choice(self.env.actions)

    # After training, evaluate with the purely greedy policy
    def greedy_policy(self, s):
        a_max = np.argmax(self.Q[s])
        return self.env.actions[a_max]


if __name__ == '__main__':
    env = YuanYangEnv()
    agent = Qlearning(env)
    agent.td_learning()
    print(np.sum(agent.Q))    # quick sanity check: total mass of the learned Q table

    # Replay and render the learned greedy path from the start state
    s = 0
    step_num = 0
    while True:
        a = agent.greedy_policy(s)
        print('%d->%s\t' % (s, a))
        env.bird_male_position = env.state_to_position(s)
        env.render()
        time.sleep(0.2)
        step_num += 1
        env.state = s
        s_, r, t = env.step(a)
        if t or step_num > 30:    # stop at the goal or after 30 steps
            break
        s = s_