Commit 9e6c8ba

edowson authored and soumith committed
reinforcement_q_learning: Remove hard-coded entries for action space. (pytorch#452)
This commit removes hard-coded entries for the output action space and gets the value directly from the gym environment.

Signed-off-by: Elvis Dowson <elvis.dowson@gmail.com>
1 parent 0eec7fa commit 9e6c8ba

File tree

1 file changed: +8 -5 lines changed

intermediate_source/reinforcement_q_learning.py

Lines changed: 8 additions & 5 deletions
@@ -208,7 +208,7 @@ def __len__(self):
 
 class DQN(nn.Module):
 
-    def __init__(self, h, w):
+    def __init__(self, h, w, outputs):
         super(DQN, self).__init__()
         self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
         self.bn1 = nn.BatchNorm2d(16)
@@ -224,7 +224,7 @@ def conv2d_size_out(size, kernel_size = 5, stride = 2):
         convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
         convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
         linear_input_size = convw * convh * 32
-        self.head = nn.Linear(linear_input_size, 2) # 448 or 512
+        self.head = nn.Linear(linear_input_size, outputs)
 
     # Called with either one element to determine next action, or a batch
     # during optimization. Returns tensor([[left0exp,right0exp]...]).
@@ -324,8 +324,11 @@ def get_screen():
 init_screen = get_screen()
 _, _, screen_height, screen_width = init_screen.shape
 
-policy_net = DQN(screen_height, screen_width).to(device)
-target_net = DQN(screen_height, screen_width).to(device)
+# Get number of actions from gym action space
+n_actions = env.action_space.n
+
+policy_net = DQN(screen_height, screen_width, n_actions).to(device)
+target_net = DQN(screen_height, screen_width, n_actions).to(device)
 target_net.load_state_dict(policy_net.state_dict())
 target_net.eval()
 
@@ -349,7 +352,7 @@ def select_action(state):
             # found, so we pick action with the larger expected reward.
             return policy_net(state).max(1)[1].view(1, 1)
     else:
-        return torch.tensor([[random.randrange(2)]], device=device, dtype=torch.long)
+        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)
 
 
 episode_durations = []
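
For context, a minimal standalone sketch of the pattern this commit adopts, assuming the CartPole-v0 environment used in the tutorial: the number of discrete actions is read from the gym action space instead of being hard-coded.

import random

import gym
import torch

# Query the environment for its discrete action count rather than assuming 2.
env = gym.make('CartPole-v0').unwrapped
n_actions = env.action_space.n  # 2 for CartPole-v0, but no longer hard-coded

# Random exploration can then sample uniformly over the actual action space.
random_action = torch.tensor([[random.randrange(n_actions)]], dtype=torch.long)
print(n_actions, random_action)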

Comments (0)