@@ -269,18 +269,20 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo
         """

         info = new_info[self.brain_name]
-        last_info = current_info[self.brain_name]
         for l in range(len(info.agents)):
             agent_actions = self.training_buffer[info.agents[l]]['actions']
             if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon'])
                     and len(agent_actions) > 0):
+                agent_id = info.agents[l]
                 if info.local_done[l] and not info.max_reached[l]:
                     value_next = 0.0
                 else:
                     if info.max_reached[l]:
-                        bootstrapping_info = last_info
+                        bootstrapping_info = self.training_buffer[agent_id].last_brain_info
+                        idx = bootstrapping_info.agents.index(agent_id)
                     else:
                         bootstrapping_info = info
+                        idx = l
                     feed_dict = {self.model.batch_size: len(bootstrapping_info.vector_observations), self.model.sequence_length: 1}
                     if self.use_observations:
                         for i in range(len(bootstrapping_info.visual_observations)):
@@ -293,8 +295,7 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo
                         feed_dict[self.model.memory_in] = bootstrapping_info.memories
                     if not self.is_continuous_action and self.use_recurrent:
                         feed_dict[self.model.prev_action] = np.reshape(bootstrapping_info.previous_vector_actions, [-1])
-                    value_next = self.sess.run(self.model.value, feed_dict)[l]
-                agent_id = info.agents[l]
+                    value_next = self.sess.run(self.model.value, feed_dict)[idx]

                 self.training_buffer[agent_id]['advantages'].set(
                     get_gae(
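The substantive change in these hunks is how the bootstrap value is indexed: when an agent hits the step limit, the value estimate is now taken from that agent's stored last_brain_info and indexed by the agent's own position in that stored batch (idx), rather than by the loop index l over the current step's agents. Below is a minimal, standalone sketch of that lookup; FakeBrainInfo and value_estimates are made-up illustration names, not the actual ML-Agents API.

# Minimal sketch (hypothetical names) of looking up a bootstrap value by the agent's
# position in a stored batch instead of by the loop index over the current batch.

class FakeBrainInfo:
    """Stand-in for a per-step batch: agent ids and one value estimate per agent."""
    def __init__(self, agents, value_estimates):
        self.agents = agents                    # agent ids, in batch order
        self.value_estimates = value_estimates  # aligned with `agents`

# The trainer loop iterates over the current step's ordering of agents ...
current = FakeBrainInfo(agents=["A", "B", "C"], value_estimates=[0.1, 0.2, 0.3])
# ... but the last stored batch for agent "C" may contain a different set/order of agents.
stored = FakeBrainInfo(agents=["C", "A"], value_estimates=[0.9, 0.5])

l = current.agents.index("C")   # loop index in the current batch: 2
# Indexing the stored batch with `l` would be wrong (here it is even out of range):
#   stored.value_estimates[l]  ->  IndexError
# Looking up the agent's own position in the stored batch gives the right row:
idx = stored.agents.index("C")
value_next = stored.value_estimates[idx]
print(value_next)  # 0.9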