[IBR-2091] Convert action type to numpy array in select_action function

isk03276 · isk03276 · commit 8eaeb2ebb579 · 2021-07-20T11:37:57.000+09:00
diff --git a/rl_algorithms/gail/agent.py b/rl_algorithms/gail/agent.py
@@ -168,7 +168,10 @@ def train(self):
                 # gail reward (imitation reward)
                 gail_reward = compute_gail_reward(
                     self.learner.discriminator(
-                        (numpy2floattensor(state, self.learner.device), action)
+                        (
+                            numpy2floattensor(state, self.learner.device),
+                            numpy2floattensor(action, self.learner.device),
+                        )
                     )
                 )
 
diff --git a/rl_algorithms/ppo/agent.py b/rl_algorithms/ppo/agent.py
@@ -135,6 +135,7 @@ def select_action(self, state: np.ndarray) -> torch.Tensor:
         with torch.no_grad():
             state = numpy2floattensor(state, self.learner.device)
             selected_action, dist = self.learner.actor(state)
+            selected_action = selected_action.detach()
             log_prob = dist.log_prob(selected_action)
             value = self.learner.critic(state)
 
@@ -155,7 +156,7 @@ def select_action(self, state: np.ndarray) -> torch.Tensor:
                 self.values.append(value)
                 self.log_probs.append(_log_prob)
 
-        return selected_action
+        return selected_action.detach().cpu().numpy()
 
     def step(
         self, action: Union[np.ndarray, torch.Tensor]

Original file line number	Diff line number	Diff line change
`@@ -168,7 +168,10 @@ def train(self):`
`168`	`168`	`# gail reward (imitation reward)`
`169`	`169`	`gail_reward = compute_gail_reward(`
`170`	`170`	`self.learner.discriminator(`
`171`		`- (numpy2floattensor(state, self.learner.device), action)`
	`171`	`+ (`
	`172`	`+ numpy2floattensor(state, self.learner.device),`
	`173`	`+ numpy2floattensor(action, self.learner.device),`
	`174`	`+ )`
`172`	`175`	`)`
`173`	`176`	`)`
`174`	`177`