
Commit d02da8d
fixed reward leaking bug
Mentioned in issue #8
nikhilbarhate99 authored Sep 20, 2019
1 parent fd5381d commit d02da8d
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions PPO_continuous.py
@@ -12,12 +12,14 @@ def __init__(self):
         self.states = []
         self.logprobs = []
         self.rewards = []
+        self.is_terminals = []
 
     def clear_memory(self):
         del self.actions[:]
         del self.states[:]
         del self.logprobs[:]
         del self.rewards[:]
+        del self.is_terminals[:]
 
 class ActorCritic(nn.Module):
     def __init__(self, state_dim, action_dim, action_std):
@@ -94,9 +96,11 @@ def update(self, memory):
         # Monte Carlo estimate of rewards:
         rewards = []
         discounted_reward = 0
-        for reward in reversed(memory.rewards):
+        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
             discounted_reward = reward + (self.gamma * discounted_reward)
             rewards.insert(0, discounted_reward)
+            if is_terminal:
+                discounted_reward = 0
 
         # Normalizing the rewards:
         rewards = torch.tensor(rewards).to(device)
@@ -178,8 +182,10 @@ def main():
             # Running policy_old:
             action = ppo.select_action(state, memory)
             state, reward, done, _ = env.step(action)
-            # Saving reward:
+
+            # Saving reward and is_terminals:
             memory.rewards.append(reward)
+            memory.is_terminals.append(done)
 
             # update if its time
             if time_step % update_timestep == 0:
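For context, the core of this change is that each transition's done flag is now stored alongside its reward, so the Monte Carlo return computed in update() can be restarted at every episode boundary. Before this commit, a single running discounted_reward carried across episode boundaries whenever the rollout buffer (which is only flushed every update_timestep steps) spanned more than one episode, so rewards from a later episode leaked into the returns of an earlier one. The snippet below is a minimal, standalone sketch of that idea, not the repository's code; the helper name monte_carlo_returns, the gamma value, and the toy rewards are illustrative only.

def monte_carlo_returns(rewards, is_terminals, gamma=0.99):
    # Walk the buffer backwards, accumulating a discounted return per step.
    returns = []
    discounted = 0.0
    for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
        if is_terminal:
            # Episode boundary (seen in reverse): drop the running return so the
            # later episode's rewards do not bleed into the earlier episode.
            discounted = 0.0
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)
    return returns

if __name__ == "__main__":
    # Two short episodes collected back-to-back in one buffer.
    rewards      = [1.0, 1.0, 1.0, 5.0, 5.0]
    is_terminals = [False, False, True, False, True]
    print(monte_carlo_returns(rewards, is_terminals, gamma=0.9))
    # -> [2.71, 1.9, 1.0, 9.5, 5.0]
    # The first episode's returns (2.71, 1.9, 1.0) depend only on its own rewards;
    # without the reset, the 5.0 rewards of the second episode would be folded in.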
