comment

User · User · commit 853be19fd339 · 2021-03-24T19:09:24.000-04:00
diff --git a/rl/monte_carlo.py b/rl/monte_carlo.py
@@ -40,6 +40,10 @@ def play_game(grid, policy, max_steps=20):
       break
 
   # calculate the returns by working backwards from the terminal state
+
+  # we want to return:
+  # states  = [s(0), s(1), ..., s(T-1)]
+  # returns = [G(0), G(1), ..., G(T-1)]
   G = 0
   states_and_returns = []
   first = True
diff --git a/rl/monte_carlo2.py b/rl/monte_carlo2.py
@@ -48,6 +48,10 @@ def play_game(grid, policy, max_steps=20):
     # note: there is no need to store the final terminal state
     s = next_s
 
+  # we want to return:
+  # states  = [s(0), s(1), ..., s(T-1)]
+  # rewards = [R(1), R(2), ..., R(T)  ]
+
   return states, rewards