fix selecting action

MorvanZhou · MorvanZhou · commit 1faef74e7ddf · 2018-09-03T11:10:24.000+08:00
diff --git a/contents/2_Q_Learning_maze/RL_brain.py b/contents/2_Q_Learning_maze/RL_brain.py
@@ -23,8 +23,8 @@ def choose_action(self, observation):
         if np.random.uniform() < self.epsilon:
             # choose best action
             state_action = self.q_table.loc[observation, :]
-            state_action = state_action.reindex(np.random.permutation(state_action.index))     # some actions have same value
-            action = state_action.idxmax()
+            # some actions may have the same value; randomly choose one of these actions
+            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
         else:
             # choose random action
             action = np.random.choice(self.actions)
diff --git a/contents/3_Sarsa_maze/RL_brain.py b/contents/3_Sarsa_maze/RL_brain.py
@@ -35,8 +35,8 @@ def choose_action(self, observation):
         if np.random.rand() < self.epsilon:
             # choose best action
             state_action = self.q_table.loc[observation, :]
-            state_action = state_action.reindex(np.random.permutation(state_action.index))     # some actions have same value
-            action = state_action.idxmax()
+            # some actions may have the same value; randomly choose one of these actions
+            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
         else:
             # choose random action
             action = np.random.choice(self.actions)
diff --git a/contents/4_Sarsa_lambda_maze/RL_brain.py b/contents/4_Sarsa_lambda_maze/RL_brain.py
@@ -35,8 +35,8 @@ def choose_action(self, observation):
         if np.random.rand() < self.epsilon:
             # choose best action
             state_action = self.q_table.loc[observation, :]
-            state_action = state_action.reindex(np.random.permutation(state_action.index))     # some actions have same value
-            action = state_action.idxmax()
+            # some actions may have the same value; randomly choose one of these actions
+            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
         else:
             # choose random action
             action = np.random.choice(self.actions)