AC_separate debugging
Rebel-Uranus committed Mar 3, 2020
1 parent 1c69750 commit a1c8a61
Showing 4 changed files with 71 additions and 34 deletions.
29 changes: 20 additions & 9 deletions Env.py
@@ -27,8 +27,8 @@ def reset(self):
self.his_accept = []
self.buy_ticket_value = 0
self.his_price = np.zeros((self.his_t, 87))
# self.routeId = np.random.randint(len(self.data))
self.routeId =21
self.routeId = np.random.randint(len(self.data))
# self.routeId =21
self.order_distribution = OrderGenerator(self.data[self.routeId], self.mode)
self.totalReward = 0

@@ -101,13 +101,19 @@ def SeparateStep(self, accpet_act, buy_act):
order_accept = 1
self.order_left -= 1
reward_accept = self.getAcceptReward(accpet_act)


# print("Accept:",self.today+1)
else:
reward_accept = -1
if self.today >= 86:
if type(buy_act).__name__ == 'list':
buy_act = np.ones_like(buy_act)
else:
buy_act = 1
reward_buy = self.getBuyReward(buy_act, self.orders)
if len(self.orders)==len(buy_act):
reward_buy = self.getBuyReward(buy_act, self.orders)

for i in range(len(buy_act)):
if buy_act[i] == 1 :

self.orders.pop(i)
profit = reward_buy
self.profit += profit
@@ -149,15 +155,20 @@ def getBuyReward(self, act, orders):
reward = 0
today_price = self.data[self.routeId][self.today]
if type(act).__name__ == 'list':
for i in range(len(orders)):
for i in range(len(act)):
# print("LOOK:", orders[i], today_price)
if act[i] != 0:
reward += orders[i] - today_price
if len(orders) >0:
reward += orders[i] - today_price
else:
reward -= 1
else:
reward += 0
elif act != 0 and len(orders) >= 1:
for order in orders:
reward += order - today_price
elif act != 0 and len(orders) == 0:
reward -= 1
self.totalReward += reward
return reward

@@ -168,7 +179,7 @@ def getTodayIndex(self):
if self.order_num == self.order_left:
return -1
else:
return self.today
return self.today+1



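A minimal standalone sketch of the buy-reward rule this Env.py change moves toward; the function name, arguments, and sample prices below are illustrative only, not part of the repository. A buy earns the difference between the order's price and today's price, and a buy with no outstanding order is penalised by 1.

# Hypothetical restatement of getBuyReward's list branch after this commit.
def buy_reward_sketch(buy_act, orders, today_price):
    reward = 0
    for i, a in enumerate(buy_act):
        if a != 0:
            if i < len(orders):
                # the buy fills order i: profit is the order price minus today's fare
                reward += orders[i] - today_price
            else:
                # buying with no matching order is penalised
                reward -= 1
    return reward

print(buy_reward_sketch([1], [1900.0], 1850.0))  # 50.0
print(buy_reward_sketch([1], [], 1850.0))        # -1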
18 changes: 11 additions & 7 deletions OrderGenerate.py
@@ -12,20 +12,21 @@ def OrderGenerator(data, mode):
order_list = []
order_price = -1
if mode == 1:
for i in range(1,87):
for i in range(87):
have_order = np.random.randint(0, 3)
if have_order == 0:
order_price = np.min(np.hstack((np.average(data[:i]), data[i]))) - np.random.randint(0, 50)
order_price = np.min(np.hstack((np.average(data[:i+1]), data[i]))) - np.random.randint(0, 50)
# order_price = 1877
else:
order_price = -1
order_list.append(order_price)


elif mode == 2:
for i in range(1,87):
for i in range(87):
have_order = np.random.randint(0, 3)
if have_order == 0:
order_price = np.min(np.hstack((np.average(data[:i]), data[i]))) * ((0.95 - 0.7) * np.random.sample() + 0.7)
order_price = np.min(np.hstack((np.average(data[:i+1]), data[i]))) * ((0.95 - 0.7) * np.random.sample() + 0.7)
else:
order_price = -1
order_list.append(order_price)
@@ -52,6 +53,9 @@ def readRoute(filename):
return result

if __name__ == "__main__":
route_list = readRoute("./wang/data/route")
print(len(OrderGenerator(route_list[21],1)))
# print(route_list[21][:0])
# route_list = readRoute("./wang/data/route")
# print(len(OrderGenerator(route_list[21],1)))
# print(route_list[21][:0])
a = [1,2,3,4,5,6,7,8,9]
a = [s**2 for s in a if s>5]
print(a)
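A rough sketch of what the mode-1 generator does after this fix, where the loop now covers all 87 days and the running average data[:i+1] includes day i itself; the function name and the rng argument are illustrative, not part of the repository.

import numpy as np

def order_generator_sketch(data, rng=np.random):
    # Each of the 87 days has a 1/3 chance of carrying an order whose price
    # sits below both today's fare and the running average so far; -1 marks
    # "no order today".
    orders = []
    for i in range(87):
        if rng.randint(0, 3) == 0:
            base = min(np.average(data[:i + 1]), data[i])
            orders.append(base - rng.randint(0, 50))
        else:
            orders.append(-1)
    return orders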
34 changes: 20 additions & 14 deletions Train.py
@@ -22,7 +22,7 @@ def __init__(self):
self.routeline = []
self.allRoute = readRoute("./wang/data/route")
self.env = Env(self.allRoute, history_take_off=1, order_num=1)
self.epsilon = 0.
self.epsilon = 0.5

def transcate_AC(self):
total_steps = 0 # step counter: one day is one step
@@ -38,7 +38,7 @@ def transcate_AC(self):
prob_clip=0.,
)
gameNum = 0 # episode counter
ex_steps = 500 # number of episodes over which exploration decays
ex_steps = 100 # number of episodes over which exploration decays
epsilon = self.epsilon
reward_list = [0] # stores each episode's return, used to compute the baseline
Loss_list = [] # stores the loss values during training
@@ -61,7 +61,7 @@ def transcate_AC(self):
while terminal == False:
today = self.env.getTodayIndex()
# current state
state_tf = np.mat(state[1][0])
state_tf = state[1][0]
# print(state_tf,len(state_tf))
# action chosen by the neural network
if random.random()<epsilon and isExploration == False:
Expand All @@ -79,20 +79,24 @@ def transcate_AC(self):
else:
#action from learning
action,p = brain.choose_action(state_tf, today)

tao_prob.append(p)
if today >= 0:
wait_day.append(today)

# order dict, price history curve, reward
next_state,reward,terminal,_ = self.env.SeparateStep(1, [action])
today = self.env.getTodayIndex()
tao_reward.append(reward)
# order completed or last day reached
state_ = next_state[1][0]
td_error = brain.criticLearn(state_tf, reward[1], state_)
baseline = td_error
profitAdvanced_list.append(td_error[0][0])
loss = brain.actorLearn(state_tf, action, td_error)
# print(loss)
Loss_list.append(loss)

if today >= 0:
wait_day.append(today)
td_error = brain.criticLearn(state_tf, reward[1], state_)
baseline = td_error
profitAdvanced_list.append(td_error[0][0])
loss = brain.actorLearn(state_tf, action, td_error)
# print(loss)
Loss_list.append(loss)

# save the record to the replay memory
# print("this is store arg:",state_tf,";", action,";", reward,";", env.getTodayIndex())
# brain.store_transition(state_tf, action, reward, env.getTodayIndex())
@@ -105,8 +109,8 @@ def transcate_AC(self):
state = next_state

# total return of one episode
epsilon = self.epsilon*(ex_steps/500)
print("epsilon:",epsilon)
epsilon = self.epsilon*(ex_steps/100)
print("epsilon:",ex_steps)
print("TD_Error:",baseline)
profit = self.env.getTotalReward()
profit_list.append(profit)
@@ -155,6 +159,8 @@ def writeHistory(self, filename, epsilon, baseline, total_steps, profit_list, pr
f.write("########################" + str(gameNum) + "###########################\n")
f.flush()



if __name__ == "__main__":
P = TikcetPlay()
P.transcate_AC()
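A minimal sketch of the exploration schedule implied by the new constants in Train.py (self.epsilon = 0.5, ex_steps = 100), assuming ex_steps is decremented once per episode, which this diff does not show; the function name is illustrative.

def epsilon_schedule_sketch(episode, eps0=0.5, decay_episodes=100):
    # epsilon falls linearly from eps0 to 0 over decay_episodes episodes,
    # then stays at 0.
    remaining = max(decay_episodes - episode, 0)
    return eps0 * remaining / decay_episodes

print([round(epsilon_schedule_sketch(e), 2) for e in (0, 50, 100, 150)])
# [0.5, 0.25, 0.0, 0.0]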
24 changes: 20 additions & 4 deletions algorithm/BrainAC.py
@@ -40,13 +40,27 @@ def __init__(
self.sess.run(tf.global_variables_initializer())

def choose_action(self, state, length):
return self.actor.choose_action(state, length)
stateP = self.statePreprocess(state)
return self.actor.choose_action(stateP, length)

def criticLearn(self, state, reward, state_):
return self.critic.learn(state, reward, state_)
stateP = self.statePreprocess(state)
state_P = self.statePreprocess(state_)
return self.critic.learn(stateP, reward, state_P)

def actorLearn(self, state, action, td_error):
return self.actor.learn(state, action, td_error)
stateP = self.statePreprocess(state)
return self.actor.learn(stateP, action, td_error)

def statePreprocess(self,state):
exist = np.array([n for n in state if n>0])
exist -= 1877.368
exist /= 256.61
exist = exist.tolist()
while len(exist) < 87:
exist.append(0)
# print("STATE:", self.state,"EXIST:",exist)
return exist

class Actor(object):
def __init__(self, sess, n_features, n_actions, lr=0.001):
@@ -84,11 +98,13 @@ def __init__(self, sess, n_features, n_actions, lr=0.001):
self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v)

def learn(self, s, a, td):
s = np.mat(s)
feed_dict = {self.s: s, self.a: a, self.td_error: td}
_, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
return exp_v

def choose_action(self, s, length):
s = np.mat(s)
probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions
# print(probs)
action = np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return a int
@@ -142,7 +158,7 @@ def learn(self, s, r, s_):
# s_ = s_[np.newaxis, :]
v_ = self.sess.run(self.v, {self.s: s_})
# print("V(S):",v_)

s = np.mat(s)


td_error, _ = self.sess.run([self.td_error, self.train_op],
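A self-contained sketch of the preprocessing path added to BrainAC, assuming the constants 1877.368 and 256.61 are a precomputed mean and standard deviation of the route prices; the helper name and the sample prices are illustrative.

import numpy as np

def state_preprocess_sketch(state, mean=1877.368, std=256.61, length=87):
    # Keep only observed (positive) prices, standardise them with the fixed
    # mean/std, then zero-pad back to the 87-day input length.
    exist = [(p - mean) / std for p in state if p > 0]
    exist += [0.0] * (length - len(exist))
    return exist

# Actor.learn, Actor.choose_action and Critic.learn now wrap the preprocessed
# list in np.mat, producing the 1 x 87 row vector the placeholders expect.
s = np.mat(state_preprocess_sketch([1900.0, 1850.0, 0.0]))
print(s.shape)  # (1, 87)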
