Commit

Merge branch 'master' into qiwye/lightrnn

chivee committed Jul 23, 2017
2 parents d5a18ba + 1e61898 commit 4a272fb
Showing 68 changed files with 4,521 additions and 6 deletions.
41 changes: 41 additions & 0 deletions Examples/ReinforcementLearning/deeprl/README.md
@@ -0,0 +1,41 @@
Examples of running the CNTK DeepRL toolkit.

Dependencies:
- OpenAI Gym: https://gym.openai.com/docs
- Atari: https://github.com/openai/gym#atari
Use the following command to install Atari games on Windows:
pip install git+https://github.com/Kojoley/atari-py.git

The following commands assume Examples/ReinforcementLearning/deeprl/scripts as the working directory.

To train an agent using one of the following algorithms, run the corresponding command:
- TabularQLearning
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/tabular_qlearning.config --eval_period=1000 --eval_steps=20000

- QLearning
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000

- ActorCritic
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/policy_gradient.config --eval_period=1000 --eval_steps=20000

- RandomAgent
python run.py --env=CartPole-v0 --max_steps=100 --eval_period=1 --eval_steps=200000

Using QLearning as an example, the command
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000
tells the QLearning agent to interact with the environment CartPole-v0 for a maximum of
100000 steps, with an evaluation performed every 1000 steps. Each evaluation reports the
average reward per episode obtained by interacting with the environment for 20000 steps.

The agent configs, best model and evaluation results are written to --output_dir,
which defaults to 'output' in the working directory. To view the evaluation
results, run the following commands in Python:

import shelve
d = shelve.open('output/output.wks')
d['reward_history']
d.close()

Note: reading and writing the wks file simultaneously will corrupt it. To
check your results while the program is still running, make a copy of the wks
file and read the numbers from the copy.
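
For example, a minimal sketch of this workflow, assuming the default --output_dir
of 'output' (the copy name output_copy.wks is arbitrary, and depending on the dbm
backend the shelf may be stored as one or several files):

import glob
import shelve
import shutil

# Copy every file belonging to the shelf so the live data is never read
# while the agent is still writing to it.
for path in glob.glob('output/output.wks*'):
    shutil.copy(path, path.replace('output.wks', 'output_copy.wks'))

d = shelve.open('output/output_copy.wks', flag='r')  # open the copy read-only
print(d['reward_history'])
d.close()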
Empty file.
29 changes: 29 additions & 0 deletions Examples/ReinforcementLearning/deeprl/env/env_factory.py
@@ -0,0 +1,29 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

from gym import envs

from . import maze2d, puddleworld


def register_env(env_id):
if env_id == 'Maze2D-v0':
envs.register(
id=env_id,
entry_point='env:maze2d.Maze2D',
kwargs={},
max_episode_steps=200,
reward_threshold=-110.0)
elif env_id == 'PuddleWorld-v0':
envs.register(
id=env_id,
entry_point='env:puddleworld.PuddleWorld',
kwargs={},
max_episode_steps=200,
reward_threshold=-100.0)
else:
raise ValueError('Cannot find environment "{0}"\n'.format(env_id))
return True
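
A minimal usage sketch for register_env, assuming OpenAI Gym is installed and the
env package is importable from the working directory (as in the scripts directory
described in the README above):

import gym

from env import env_factory

# Register the custom maze environment once, then create and drive it through
# the standard Gym API.
env_factory.register_env('Maze2D-v0')
env = gym.make('Maze2D-v0')
observation = env.reset()
observation, reward, done, info = env.step(env.action_space.sample())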
95 changes: 95 additions & 0 deletions Examples/ReinforcementLearning/deeprl/env/maze2d.py
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding


class Maze2D(gym.Env):
"""This class creates a maze problem given a map."""

metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}

def __init__(self):
self._load_map()
self.viewer = None
self.action_space = spaces.Discrete(4)
self.observation_space = spaces.Discrete(self.room_lengths[0] *
self.room_lengths[1])
self._seed()
self._reset()

def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]

def _step(self, action):
assert self.action_space.contains(action), "%r (%s) invalid" % (
action, type(action))

        # Use the environment's seeded RNG so runs are reproducible.
        if self.np_random.uniform(0, 1) > self.motion_noise:
state0 = self.state[0]
state1 = self.state[1]
if action == 0: # north
state1 = np.minimum(self.room_lengths[1] - 1, state1 + 1)
elif action == 1: # east
state0 = np.minimum(self.room_lengths[0] - 1, state0 + 1)
elif action == 2: # south
state1 = np.maximum(0, state1 - 1)
else: # west
state0 = np.maximum(0, state0 - 1)
            if [state0, state1] not in self.wall_states:
self.state[0] = state0
self.state[1] = state1

done = self._is_goal(self.state)
reward = -1.0
return self._encode_state(self.state), reward, done, {}

def _reset(self):
        rnd_index = self.np_random.randint(0, len(self.initial_states))
self.state = self.initial_states[rnd_index][:]
return self._encode_state(self.state)

def _load_map(self):
self.room_lengths = np.array([25, 25])
self.initial_states = [[0, 0]]
self.goal_states = [[24, 24]]
self.wall_states = []
self._build_wall([2, 0], [2, 15])
self._build_wall([5, 10], [5, 20])
self._build_wall([5, 12], [13, 12])
self._build_wall([15, 5], [15, 24])
self._build_wall([10, 5], [22, 5])
self.num_states = self.room_lengths[0] * self.room_lengths[1]
self.motion_noise = 0.05

    def _is_goal(self, state):
        # Check the state argument (callers pass self.state) against the goals.
        return state in self.goal_states

def _encode_state(self, state):
return int(state[1] * self.room_lengths[0] + state[0])

def _build_wall(self, start, end):
x_min = np.maximum(0, np.minimum(start[0], end[0]))
x_max = np.minimum(self.room_lengths[0] - 1,
np.maximum(start[0], end[0]))
y_min = np.maximum(0, np.minimum(start[1], end[1]))
y_max = np.minimum(self.room_lengths[1] - 1,
np.maximum(start[1], end[1]))
for x in range(x_min, x_max + 1):
for y in range(y_min, y_max + 1):
if not ([x, y] in self.goal_states or
[x, y] in self.initial_states):
self.wall_states.append([x, y])

def _render(self, mode='human', close=False):
pass
102 changes: 102 additions & 0 deletions Examples/ReinforcementLearning/deeprl/env/puddleworld.py
@@ -0,0 +1,102 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding


class PuddleWorld(gym.Env):
"""This class creates a continous-state maze problem given a map."""

metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}

def __init__(self):
self._load_map()
self.viewer = None
self.action_space = spaces.Discrete(4)
self.observation_space = spaces.Box(np.zeros(2), self.room_lengths)
self._seed()
self._reset()

def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]

def _step(self, action):
assert self.action_space.contains(action), "%r (%s) invalid" % (
action, type(action))

        # Use the environment's seeded RNG so runs are reproducible.
        if self.np_random.uniform(0., 1.) > self.motion_noise:
state0 = self.state[0]
state1 = self.state[1]
# Motion length is a truncated normal random variable.
motion_length = np.maximum(
0.,
np.minimum(
self.motion_max,
                    self.np_random.normal(self.motion_mean, self.motion_std)))
if action == 0: # north
state1 = np.minimum(self.room_lengths[1],
state1 + motion_length)
elif action == 1: # east
state0 = np.minimum(self.room_lengths[0],
state0 + motion_length)
elif action == 2: # south
state1 = np.maximum(0., state1 - motion_length)
else: # west
state0 = np.maximum(0., state0 - motion_length)
self.state[0] = state0
self.state[1] = state1

done = self._is_goal(self.state)
reward = self._compute_reward(self.state)
return self.state, reward, done, {}

def _reset(self):
self.state = np.copy(self.initial_state)
return self.state

def _load_map(self):
self.room_lengths = np.array([1., 1.])
self.initial_state = np.array([0., 0.])
self.goal_state = np.array([1., 1.])
self.goal_width = 0.01
self.motion_noise = 0.05 # probability of no-motion (staying in same state)
self.motion_mean = 0.1 # mean of motion length
self.motion_std = 0.1 * self.motion_mean # std of motion length
self.motion_max = 2.0 * self.motion_mean
self.puddle_centers = []
self.puddle_radii = []
self._build_puddle(np.array([0.2, 0.4]), 0.1)
self._build_puddle(np.array([0.5, 0.8]), 0.1)
self._build_puddle(np.array([0.9, 0.1]), 0.1)
self.num_puddles = len(self.puddle_centers)
self.puddle_cost = 2.0

    def _compute_reward(self, state):
        reward = -1.0
        for i in range(self.num_puddles):
            delta = state - self.puddle_centers[i]
            # Compare the Euclidean distance (not the squared distance) to the
            # puddle radius to decide whether the agent is inside the puddle.
            dist = np.sqrt(np.dot(delta, delta))
            if dist <= self.puddle_radii[i]:
                reward -= self.puddle_cost
        return reward

def _is_goal(self, state):
return state[0] >= self.goal_state[0] - self.goal_width and \
state[1] >= self.goal_state[1] - self.goal_width

def _build_puddle(self, center, radius):
self.puddle_centers.append(center)
self.puddle_radii.append(radius)

def _render(self, mode='human', close=False):
pass
@@ -0,0 +1,35 @@
# See cntk.contrib.deeprl.agent.shared.policy_gradient_parameters for detailed
# explanation of each parameter.

[General]
Agent = actor_critic
Gamma = 0.99
# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing
# PreProcessingArgs = (4,)

[PolicyGradient]
SharedRepresentation = False
# PolicyRepresentation/ValueFunctionRepresentation can be nn, or some
# customized model defined as module_name.method_name, e.g.
# PolicyRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn
PolicyRepresentation = nn
InitialPolicy =
# ValueFunctionRepresentation is ignored when SharedRepresentation is true
ValueFunctionRepresentation = nn
UpdateFrequency = 32
RelativeStepSize = 0.5
RegularizationWeight = 0.001

[NetworkModel]
# Use (a list of integers) when PolicyRepresentation is nn
PolicyNetworkHiddenLayerNodes = [20]

# Use (a list of integers) when ValueFunctionRepresentation is nn, ignored when
# SharedRepresentation is true
ValueNetworkHiddenLayerNodes = [20]

[Optimization]
Momentum = 0.95
InitialEta = 0.01
EtaDecayStepCount = 10000
EtaMinimum = 0.01
@@ -0,0 +1,46 @@
# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed
# explanation of each parameter.

[General]
Agent = qlearning
Gamma = 0.99
# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing
# PreProcessingArgs = (4,)

[QLearningAlgo]
InitialEpsilon = 1.0
EpsilonDecayStepCount = 10000
EpsilonMinimum = 0.01
InitialQ = 0.0
TargetQUpdateFrequency = 100
QUpdateFrequency = 4
MinibatchSize = 32
# QRepresentation can be 'dqn', 'dueling-dqn', or some customized model defined as
# module_name.method_name, e.g.
# QRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn
QRepresentation = dqn
ErrorClipping = False
ReplaysPerUpdate = 1

[ExperienceReplay]
Capacity = 500
StartSize = 100
Prioritized = True
PriorityAlpha = 0.7
PriorityBeta = 1
PriorityEpsilon = 0.0001

[NetworkModel]
# Use (a list of integers) when QRepresentation is 'dqn'
HiddenLayerNodes = [20]

# Or use (a list of integers followed by two lists of integers) when
# QRepresentation is 'dueling-dqn'
; HiddenLayerNodes = [10, [5], [5]]

[Optimization]
Momentum = 0.9
InitialEta = 0.01
EtaDecayStepCount = 10000
EtaMinimum = 0.0001
GradientClippingThreshold = 10
@@ -0,0 +1,17 @@
# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed
# explanation of each parameter.

[General]
Agent = tabular_qlearning
Gamma = 0.99

[QLearningAlgo]
InitialEpsilon = 1.0
EpsilonDecayStepCount = 100000
EpsilonMinimum = 0.01
InitialEta = 0.5
EtaDecayStepCount = 100000
EtaMinimum = 0.1
InitialQ = 0.0
DiscretizationResolution = 10
QRepresentation = tabular
