Qlearning(lamda).py
import random
import time

import numpy as np

from yuanyang_env2 import YuanYangEnv

class Qlearning(object):
    def __init__(self, env):
        self.gamma = 0.9    # discount factor
        self.lamda = 0.5    # trace-decay parameter (the lambda in Q(lambda))
        self.lr = 0.1       # learning rate
        self.env = env
        self.epsilon = 1.0  # exploration rate of the epsilon-greedy policy
        # Q-table: one row per state, one column per action
        self.Q = np.zeros((len(self.env.states), len(self.env.actions)), dtype=np.float32)
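
    # Tabular Q(lambda) with accumulating traces, as implemented below
    # (a sketch of the update, stated here for reference):
    #   delta  = r + gamma * max_a' Q(s', a') - Q(s, a)
    #   E     <- gamma * lamda * E,  then E(s, a) += 1
    #   Q     <- Q + lr * delta * E
    # Each TD error is thus credited to every recently visited
    # state-action pair, weighted by the recency of the visit.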
    def td_learning(self):
        for i in range(5000):
            # reset the eligibility traces at the start of each episode
            self.E = np.zeros((len(self.env.states), len(self.env.actions)), dtype=np.float32)
            s = self.env.reset()
            step_n = 0
            while True:
                # actions are assumed to be integer indices, so `a` can
                # index the Q-table directly
                a = self.e_greedy_policy(s)
                s_, r, t = self.env.step(a)
                # TD error bootstraps on the greedy value of the next state
                td_error = r + self.gamma * np.max(self.Q[s_]) - self.Q[s, a]
                # decay all traces by gamma * lamda, then reinforce the pair
                # just visited; unlike Watkins' Q(lambda), traces are not cut
                # after exploratory actions
                self.E *= self.gamma * self.lamda
                self.E[s, a] += 1.0
                # update all state-action values in proportion to their traces
                self.Q += self.lr * self.E * td_error
                step_n += 1
                if t or step_n > 40:
                    break
                s = s_
    # During training, use an epsilon-greedy (soft) policy
    def e_greedy_policy(self, s):
        a_max = np.argmax(self.Q[s])
        if np.random.uniform() < 1 - self.epsilon:
            return self.env.actions[a_max]  # exploit: current greedy action
        else:
            # explore: pick an action uniformly at random
            return self.env.actions[int(random.random() * len(self.env.actions))]
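
    # Note: with epsilon fixed at 1.0, the behavior policy above is fully
    # random; off-policy Q-learning can still learn the greedy policy from
    # such data. A common refinement (a hypothetical addition, not part of
    # this file) would be to decay epsilon once per episode, e.g.:
    #   self.epsilon = max(0.1, self.epsilon * 0.999)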
    # After training, evaluate with the purely greedy policy
    def greedy_policy(self, s):
        a_max = np.argmax(self.Q[s])
        return self.env.actions[a_max]

if __name__ == '__main__':
    env = YuanYangEnv()
    agent = Qlearning(env)
    agent.td_learning()
    print(np.sum(agent.Q))
    flag = True
    s = 0
    step_num = 0
    # Replay and render the learned greedy path from the start state
    while flag:
        a = agent.greedy_policy(s)
        print('%d->%s\t' % (s, a))
        env.bird_male_position = env.state_to_position(s)
        env.render()
        time.sleep(0.2)
        step_num += 1
        env.state = s
        s_, r, t = env.step(a)
        # stop at a terminal state or after 20 steps
        if t or step_num > 20:
            flag = False
        s = s_