"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is an Actor-Critic based algorithm.
Pendulum example.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/

Using:
tensorflow 1.0
gym 0.8.0
"""

import tensorflow as tf
import numpy as np
import gym
import time


#####################  hyper parameters  ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'


###############################  DDPG  ####################################


class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))
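
        # ema_getter returns the exponential-moving-average (shadow) copy of each
        # variable, so networks built with custom_getter=ema_getter become the
        # slowly-updated target networks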

        self.a = self._build_a(self.S,)
        a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor')

        # assign self.a = a in memory when calculating q for td_error,
        # otherwise the self.a is from Actor when updating Actor
        q = self._build_c(self.S, self.a,)
        c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic')

        target_update = [ema.apply(a_params), ema.apply(c_params)]
        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)

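        # the train ops below are built under control_dependencies(target_update),
        # so every learning step also applies the EMA soft update to the target weights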
        with tf.control_dependencies(target_update):
            q_target = self.R + GAMMA * q_
            # in the feed_dict for the td_error, the self.a should change to actions in memory
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            a_loss = - tf.reduce_mean(q)    # maximize the q
            self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]
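        # each memory row stores [s, a, r, s_]; the slices above recover states,
        # actions, rewards and next states from the sampled batch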

        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
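        # note: the actor update above only feeds states (its actions come from the
        # Actor net inside the graph); the critic update feeds the stored actions
        # by overriding the self.a tensor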

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
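            # tanh keeps the raw action in [-1, 1]; the multiply below rescales it by
            # a_bound to the environment's action range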
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)


###############################  training  ####################################


env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)    # add randomness to action selection for exploration
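        # Gaussian noise with std `var` around the deterministic action, clipped to
        # Pendulum's action range [-2, 2]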
138
+ s_ , r , done , info = env .step (a )
139
+
140
+ ddpg .store_transition (s , a , r / 10 , s_ )
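        # the reward is scaled down (r / 10) before being stored, a common trick to
        # keep the critic's targets in a small range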
141
+
142
+ if ddpg .pointer > MEMORY_CAPACITY :
143
+ var *= .9995 # decay the action randomness
144
+ ddpg .learn ()
145
+
146
+ s = s_
147
+ ep_reward += r
148
+ if j == MAX_EP_STEPS - 1 :
149
+ print ('Episode:' , i , ' Reward: %i' % int (ep_reward ), 'Explore: %.2f' % var , )
150
+ # if ep_reward > -300:RENDER = True
151
+ break
152
+
153
+ print ('Running time: ' , time .time () - t1 )
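
# A minimal sketch (not part of the original script): after training, the learned
# policy can be watched by running the deterministic actor without exploration noise.
# Uncomment to try it; it only uses objects already defined above.
# s = env.reset()
# for _ in range(MAX_EP_STEPS):
#     env.render()
#     s, r, done, _ = env.step(ddpg.choose_action(s))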