@@ -75,9 +75,9 @@ def conjugate_gradient(net, states, loss_grad, n_step=10, residual_tol=1e-10):
            break
    return x

-class QNet(nn.Module):
+class TRPO(nn.Module):
    def __init__(self, num_inputs, num_outputs):
-        super(QNet, self).__init__()
+        super(TRPO, self).__init__()
        self.t = 0
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
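For context, the tail of conjugate_gradient shown above belongs to a standard conjugate-gradient solver for the natural-gradient system F x = g, where F is the Fisher information matrix touched only through matrix-vector products. A minimal sketch under that assumption follows; fisher_vector_product is a hypothetical helper not shown in this diff, so treat this as illustrative rather than the file's exact implementation.

import torch

def conjugate_gradient(net, states, loss_grad, n_step=10, residual_tol=1e-10):
    # Solve F x = loss_grad, using F only through Fisher-vector products.
    x = torch.zeros_like(loss_grad)
    r = loss_grad.clone()
    p = loss_grad.clone()
    r_dot_r = torch.dot(r, r)
    for _ in range(n_step):
        f_p = fisher_vector_product(net, states, p)  # hypothetical helper: F @ p
        alpha = r_dot_r / torch.dot(p, f_p)
        x += alpha * p
        r -= alpha * f_p
        new_r_dot_r = torch.dot(r, r)
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
        if r_dot_r < residual_tol:
            break
    return x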
@@ -90,19 +90,27 @@ def __init__(self, num_inputs, num_outputs):
                nn.init.xavier_uniform(m.weight)

    def forward(self, input):
-        x = torch.tanh(self.fc_1(input))
+        x = torch.relu(self.fc_1(input))
        policy = F.softmax(self.fc_2(x))

        return policy

    @classmethod
-    def train_model(cls, net, transitions, k):
-        states, actions, rewards, masks = transitions
+    def train_model(cls, net, transitions):
+        states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask
+
        states = torch.stack(states)
        actions = torch.stack(actions)
        rewards = torch.Tensor(rewards)
        masks = torch.Tensor(masks)

+        returns = torch.zeros_like(rewards)
+
+        running_return = 0
+        for t in reversed(range(len(rewards))):
+            running_return = rewards[t] + gamma * running_return * masks[t]
+            returns[t] = running_return
+
        policy = net(states)
        policy = policy.view(-1, net.num_outputs)
        policy_action = (policy * actions.detach()).sum(dim=1)
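The lines added above replace raw per-step rewards with discounted returns. As a standalone sketch of that computation (assuming gamma is the module-level discount factor used elsewhere in this file, e.g. 0.99; compute_returns is just an illustrative name):

import torch

def compute_returns(rewards, masks, gamma=0.99):
    # Walk the trajectory backwards; masks[t] is 0 at episode boundaries,
    # which resets the running discounted sum.
    returns = torch.zeros_like(rewards)
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + gamma * running_return * masks[t]
        returns[t] = running_return
    return returns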
@@ -111,7 +119,7 @@ def train_model(cls, net, transitions, k):
        old_policy = old_policy.view(-1, net.num_outputs)
        old_policy_action = (old_policy * actions.detach()).sum(dim=1)

-        surrogate_loss = ((policy_action / old_policy_action) * rewards).mean()
+        surrogate_loss = ((policy_action / old_policy_action) * returns).mean()

        surrogate_loss_grad = torch.autograd.grad(surrogate_loss, net.parameters())
        surrogate_loss_grad = flat_grad(surrogate_loss_grad)
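flat_grad is defined elsewhere in this file and is not part of the diff. A plausible sketch (an assumption, not the file's exact code) is to concatenate every per-parameter gradient into one 1-D tensor, so the conjugate-gradient routine can treat the surrogate gradient as a single flat vector:

import torch

def flat_grad(grads):
    # Concatenate per-parameter gradients into a single 1-D tensor.
    return torch.cat([g.contiguous().view(-1) for g in grads])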
@@ -130,7 +138,7 @@ def train_model(cls, net, transitions, k):
        policy = net(states)
        policy = policy.view(-1, net.num_outputs)
        policy_action = (policy * actions.detach()).sum(dim=1)
-        surrogate_loss = ((policy_action / old_policy_action) * rewards).mean()
+        surrogate_loss = ((policy_action / old_policy_action) * returns).mean()

        kl = kl_divergence(policy, old_policy)
        kl = kl.mean()
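kl_divergence is likewise defined elsewhere in the file. A hedged sketch consistent with the call above, computing a per-state KL(old || new) between the two categorical policies (the direction and the epsilon are assumptions):

import torch

def kl_divergence(policy, old_policy, eps=1e-10):
    # Per-state KL(old || new) between two categorical distributions over actions.
    old_policy = old_policy.detach()
    kl = old_policy * (torch.log(old_policy + eps) - torch.log(policy + eps))
    return kl.sum(dim=1)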
@@ -144,6 +152,6 @@ def train_model(cls, net, transitions, k):
    def get_action(self, input):
        policy = self.forward(input)
        policy = policy[0].data.numpy()
-
+
        action = np.random.choice(self.num_outputs, 1, p=policy)[0]
        return action
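Illustrative usage of the renamed class and get_action; the input/output sizes below are CartPole-style assumptions and are not part of this diff.

import torch

net = TRPO(num_inputs=4, num_outputs=2)
state = torch.randn(1, 4)        # one observation, batch dimension first
action = net.get_action(state)   # integer index sampled from the softmax policy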