# Note: you may need to update your version of future
# sudo pip install -U future

-
import numpy as np


+ACTION_SPACE = ('U', 'D', 'L', 'R')
+
+
class Grid: # Environment
  def __init__(self, rows, cols, start):
    self.rows = rows
@@ -32,6 +34,22 @@ def current_state(self):
  def is_terminal(self, s):
    return s not in self.actions

+  def get_next_state(self, s, a):
+    # this answers: where would I end up if I perform action 'a' in state 's'?
+    i, j = s[0], s[1]
+
+    # if this action moves you somewhere else, then it will be in this dictionary
+    if a in self.actions[(i, j)]:
+      if a == 'U':
+        i -= 1
+      elif a == 'D':
+        i += 1
+      elif a == 'R':
+        j += 1
+      elif a == 'L':
+        j -= 1
+    return i, j
+
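+  # A usage sketch (illustrative only): since get_next_state() never mutates
+  # self.i / self.j, it can enumerate the deterministic transition for every
+  # non-terminal state and every action in ACTION_SPACE, e.g.
+  #
+  #   g = negative_grid()
+  #   for s in g.actions.keys():          # non-terminal states only
+  #     for a in ACTION_SPACE:
+  #       print(s, a, '->', g.get_next_state(s, a))
+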
  def move(self, action):
    # check if legal move first
    if action in self.actions[(self.i, self.j)]:
@@ -116,3 +134,153 @@ def negative_grid(step_cost=-0.1):
  })
  return g

+
+
+
+
+class WindyGrid:
+  def __init__(self, rows, cols, start):
+    self.rows = rows
+    self.cols = cols
+    self.i = start[0]
+    self.j = start[1]
+
+  def set(self, rewards, actions, probs):
+    # rewards should be a dict of: (i, j): r (row, col): reward
+    # actions should be a dict of: (i, j): A (row, col): list of possible actions
+    self.rewards = rewards
+    self.actions = actions
+    self.probs = probs
+
+  def set_state(self, s):
+    self.i = s[0]
+    self.j = s[1]
+
+  def current_state(self):
+    return (self.i, self.j)
+
+  def is_terminal(self, s):
+    return s not in self.actions
+
+  def move(self, action):
+    s = (self.i, self.j)
+    a = action
+
+    next_state_probs = self.probs[(s, a)]
+    next_states = list(next_state_probs.keys())
+    next_probs = list(next_state_probs.values())
+    # np.random.choice needs a 1-D array, so sample an index, not the tuples
+    next_state_idx = np.random.choice(len(next_states), p=next_probs)
+    s2 = next_states[next_state_idx]
+
+    # update the current state
+    self.i, self.j = s2
+
+    # return a reward (if any)
+    return self.rewards.get(s2, 0)
+
+  def game_over(self):
+    # returns true if game is over, else false
+    # true if we are in a state where no actions are possible
+    return (self.i, self.j) not in self.actions
+
+  def all_states(self):
+    # possibly buggy but simple way to get all states
+    # either a position that has possible next actions
+    # or a position that yields a reward
+    return set(self.actions.keys()) | set(self.rewards.keys())
+
+
+def windy_grid():
+  g = WindyGrid(3, 4, (2, 0))
+  rewards = {(0, 3): 1, (1, 3): -1}
+  actions = {
+    (0, 0): ('D', 'R'),
+    (0, 1): ('L', 'R'),
+    (0, 2): ('L', 'D', 'R'),
+    (1, 0): ('U', 'D'),
+    (1, 2): ('U', 'D', 'R'),
+    (2, 0): ('U', 'R'),
+    (2, 1): ('L', 'R'),
+    (2, 2): ('L', 'R', 'U'),
+    (2, 3): ('L', 'U'),
+  }
+
+  # p(s' | s, a) represented as:
+  # KEY: (s, a) --> VALUE: {s': p(s' | s, a)}
+  probs = {
+    ((2, 0), 'U'): {(1, 0): 1.0},
+    ((2, 0), 'D'): {(2, 0): 1.0},
+    ((2, 0), 'L'): {(2, 0): 1.0},
+    ((2, 0), 'R'): {(2, 1): 1.0},
+    ((1, 0), 'U'): {(0, 0): 1.0},
+    ((1, 0), 'D'): {(2, 0): 1.0},
+    ((1, 0), 'L'): {(1, 0): 1.0},
+    ((1, 0), 'R'): {(1, 0): 1.0},
+    ((0, 0), 'U'): {(0, 0): 1.0},
+    ((0, 0), 'D'): {(1, 0): 1.0},
+    ((0, 0), 'L'): {(0, 0): 1.0},
+    ((0, 0), 'R'): {(0, 1): 1.0},
+    ((0, 1), 'U'): {(0, 1): 1.0},
+    ((0, 1), 'D'): {(0, 1): 1.0},
+    ((0, 1), 'L'): {(0, 0): 1.0},
+    ((0, 1), 'R'): {(0, 2): 1.0},
+    ((0, 2), 'U'): {(0, 2): 1.0},
+    ((0, 2), 'D'): {(1, 2): 1.0},
+    ((0, 2), 'L'): {(0, 1): 1.0},
+    ((0, 2), 'R'): {(0, 3): 1.0},
+    ((2, 1), 'U'): {(2, 1): 1.0},
+    ((2, 1), 'D'): {(2, 1): 1.0},
+    ((2, 1), 'L'): {(2, 0): 1.0},
+    ((2, 1), 'R'): {(2, 2): 1.0},
+    ((2, 2), 'U'): {(1, 2): 1.0},
+    ((2, 2), 'D'): {(2, 2): 1.0},
+    ((2, 2), 'L'): {(2, 1): 1.0},
+    ((2, 2), 'R'): {(2, 3): 1.0},
+    ((2, 3), 'U'): {(1, 3): 1.0},
+    ((2, 3), 'D'): {(2, 3): 1.0},
+    ((2, 3), 'L'): {(2, 2): 1.0},
+    ((2, 3), 'R'): {(2, 3): 1.0},
+    ((1, 2), 'U'): {(0, 2): 0.5, (1, 3): 0.5},
+    ((1, 2), 'D'): {(2, 2): 1.0},
+    ((1, 2), 'L'): {(1, 2): 1.0},
+    ((1, 2), 'R'): {(1, 3): 1.0},
+  }
+  g.set(rewards, actions, probs)
+  return g
+
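+# A minimal usage sketch (illustrative): roll out one episode in the windy
+# grid with uniformly random actions; each call to move() samples the next
+# state from the p(s' | s, a) table stored in g.probs.
+#
+#   g = windy_grid()
+#   s = g.current_state()
+#   episode_return = 0
+#   while not g.game_over():
+#     a = np.random.choice(g.actions[s])  # random legal action in state s
+#     r = g.move(a)                       # stochastic transition; returns reward
+#     episode_return += r
+#     s = g.current_state()
+#   print("episode return:", episode_return)
+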
+
+
+
+def grid_5x5(step_cost=-0.1):
+  g = Grid(5, 5, (4, 0))
+  rewards = {(0, 4): 1, (1, 4): -1}
+  actions = {
+    (0, 0): ('D', 'R'),
+    (0, 1): ('L', 'R'),
+    (0, 2): ('L', 'R'),
+    (0, 3): ('L', 'D', 'R'),
+    (1, 0): ('U', 'D', 'R'),
+    (1, 1): ('U', 'D', 'L'),
+    (1, 3): ('U', 'D', 'R'),
+    (2, 0): ('U', 'D', 'R'),
+    (2, 1): ('U', 'L', 'R'),
+    (2, 2): ('L', 'R', 'D'),
+    (2, 3): ('L', 'R', 'U'),
+    (2, 4): ('L', 'U', 'D'),
+    (3, 0): ('U', 'D'),
+    (3, 2): ('U', 'D'),
+    (3, 4): ('U', 'D'),
+    (4, 0): ('U', 'R'),
+    (4, 1): ('L', 'R'),
+    (4, 2): ('L', 'R', 'U'),
+    (4, 3): ('L', 'R'),
+    (4, 4): ('L', 'U'),
+  }
+  g.set(rewards, actions)
+
+  # non-terminal states
+  visitable_states = actions.keys()
+  for s in visitable_states:
+    g.rewards[s] = step_cost
+
+  return g
+
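+# A sketch (illustrative) tying the two representations together: for the
+# deterministic Grid environments, the same p(s' | s, a) dictionary format
+# used by windy_grid() can be rebuilt from get_next_state(), with probability
+# 1.0 on the single reachable next state.
+#
+#   g = grid_5x5()
+#   probs = {}
+#   for s in g.actions.keys():            # non-terminal states
+#     for a in g.actions[s]:              # legal actions in s
+#       probs[(s, a)] = {g.get_next_state(s, a): 1.0}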