Commit

Merge branch 'master' into qiwye/lightrnn

chivee committed Jul 23, 2017
2 parents d5a18ba + 1e61898 commit 4a272fb
Showing 68 changed files with 4,521 additions and 6 deletions.
41 changes: 41 additions & 0 deletions Examples/ReinforcementLearning/deeprl/README.md
@@ -0,0 +1,41 @@
Examples of running the CNTK DeepRL toolkit.

Dependencies:
- OpenAI Gym: https://gym.openai.com/docs
- Atari: https://github.com/openai/gym#atari
Use the following command to install Atari games on Windows:
pip install git+https://github.com/Kojoley/atari-py.git

The following commands assume Examples/ReinforcementLearning/deeprl/scripts as the working directory.

To train an agent using one of the following algorithms, run the corresponding command:
- TabularQLearning
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/tabular_qlearning.config --eval_period=1000 --eval_steps=20000

- QLearning
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000

- ActorCritic
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/policy_gradient.config --eval_period=1000 --eval_steps=20000

- RandomAgent
python run.py --env=CartPole-v0 --max_steps=100 --eval_period=1 --eval_steps=200000

Using QLearning as an example, the command
python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000
tells the QLearning agent to interact with the environment CartPole-v0 for a maximum of
100000 steps, with an evaluation performed every 1000 steps. Each evaluation reports the
average reward per episode obtained by interacting with the environment for 20000 steps.

The agent configs, best model and evaluation results are written to --output_dir,
which defaults to 'output' in the working directory. To view the evaluation
results, run the following commands in Python:

import shelve
d = shelve.open('output/output.wks')
d['reward_history']
d.close()

Note: reading and writing the wks file simultaneously will corrupt it. To
check your results while the program is still running, make a copy of the wks
file and read the numbers from the copy.
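
For example, a minimal sketch of this workflow, assuming the default --output_dir
of 'output' (the copy name output_copy.wks is arbitrary, and depending on the dbm
backend the shelf may be stored as one or several files):

import glob
import shelve
import shutil

# Copy every file belonging to the shelf so the live data is never read
# while the agent is still writing to it.
for path in glob.glob('output/output.wks*'):
    shutil.copy(path, path.replace('output.wks', 'output_copy.wks'))

d = shelve.open('output/output_copy.wks', flag='r')  # open the copy read-only
print(d['reward_history'])
d.close()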
Empty file.
29 changes: 29 additions & 0 deletions Examples/ReinforcementLearning/deeprl/env/env_factory.py
@@ -0,0 +1,29 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

from gym import envs

from . import maze2d, puddleworld


def register_env(env_id):
if env_id == 'Maze2D-v0':
envs.register(
id=env_id,
entry_point='env:maze2d.Maze2D',
kwargs={},
max_episode_steps=200,
reward_threshold=-110.0)
elif env_id == 'PuddleWorld-v0':
envs.register(
id=env_id,
entry_point='env:puddleworld.PuddleWorld',
kwargs={},
max_episode_steps=200,
reward_threshold=-100.0)
else:
raise ValueError('Cannot find environment "{0}"\n'.format(env_id))
return True
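
A minimal usage sketch for register_env, assuming OpenAI Gym is installed and the
env package is importable from the working directory (as in the scripts directory
described in the README above):

import gym

from env import env_factory

# Register the custom maze environment once, then create and drive it through
# the standard Gym API.
env_factory.register_env('Maze2D-v0')
env = gym.make('Maze2D-v0')
observation = env.reset()
observation, reward, done, info = env.step(env.action_space.sample())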
95 changes: 95 additions & 0 deletions Examples/ReinforcementLearning/deeprl/env/maze2d.py
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding


class Maze2D(gym.Env):
"""This class creates a maze problem given a map."""

metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}

def __init__(self):
self._load_map()
self.viewer = None
self.action_space = spaces.Discrete(4)
self.observation_space = spaces.Discrete(self.room_lengths[0] *
self.room_lengths[1])
self._seed()
self._reset()

def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]

def _step(self, action):
assert self.action_space.contains(action), "%r (%s) invalid" % (
action, type(action))

        # Use the environment's seeded RNG so runs are reproducible.
        if self.np_random.uniform(0, 1) > self.motion_noise:
state0 = self.state[0]
state1 = self.state[1]
if action == 0: # north
state1 = np.minimum(self.room_lengths[1] - 1, state1 + 1)
elif action == 1: # east
state0 = np.minimum(self.room_lengths[0] - 1, state0 + 1)
elif action == 2: # south
state1 = np.maximum(0, state1 - 1)
else: # west
state0 = np.maximum(0, state0 - 1)
            if [state0, state1] not in self.wall_states:
self.state[0] = state0
self.state[1] = state1

done = self._is_goal(self.state)
reward = -1.0
return self._encode_state(self.state), reward, done, {}

def _reset(self):
        rnd_index = self.np_random.randint(0, len(self.initial_states))
self.state = self.initial_states[rnd_index][:]
return self._encode_state(self.state)

def _load_map(self):
self.room_lengths = np.array([25, 25])
self.initial_states = [[0, 0]]
self.goal_states = [[24, 24]]
self.wall_states = []
self._build_wall([2, 0], [2, 15])
self._build_wall([5, 10], [5, 20])
self._build_wall([5, 12], [13, 12])
self._build_wall([15, 5], [15, 24])
self._build_wall([10, 5], [22, 5])
self.num_states = self.room_lengths[0] * self.room_lengths[1]
self.motion_noise = 0.05

    def _is_goal(self, state):
        # Check the state argument (callers pass self.state) against the goals.
        return state in self.goal_states

def _encode_state(self, state):
return int(state[1] * self.room_lengths[0] + state[0])

def _build_wall(self, start, end):
x_min = np.maximum(0, np.minimum(start[0], end[0]))
x_max = np.minimum(self.room_lengths[0] - 1,
np.maximum(start[0], end[0]))
y_min = np.maximum(0, np.minimum(start[1], end[1]))
y_max = np.minimum(self.room_lengths[1] - 1,
np.maximum(start[1], end[1]))
for x in range(x_min, x_max + 1):
for y in range(y_min, y_max + 1):
if not ([x, y] in self.goal_states or
[x, y] in self.initial_states):
self.wall_states.append([x, y])

def _render(self, mode='human', close=False):
pass
102 changes: 102 additions & 0 deletions Examples/ReinforcementLearning/deeprl/env/puddleworld.py
@@ -0,0 +1,102 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding


class PuddleWorld(gym.Env):
"""This class creates a continous-state maze problem given a map."""

metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}

def __init__(self):
self._load_map()
self.viewer = None
self.action_space = spaces.Discrete(4)
self.observation_space = spaces.Box(np.zeros(2), self.room_lengths)
self._seed()
self._reset()

def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]

def _step(self, action):
assert self.action_space.contains(action), "%r (%s) invalid" % (
action, type(action))

        # Use the environment's seeded RNG so runs are reproducible.
        if self.np_random.uniform(0., 1.) > self.motion_noise:
state0 = self.state[0]
state1 = self.state[1]
# Motion length is a truncated normal random variable.
motion_length = np.maximum(
0.,
np.minimum(
self.motion_max,
                    self.np_random.normal(self.motion_mean, self.motion_std)))
if action == 0: # north
state1 = np.minimum(self.room_lengths[1],
state1 + motion_length)
elif action == 1: # east
state0 = np.minimum(self.room_lengths[0],
state0 + motion_length)
elif action == 2: # south
state1 = np.maximum(0., state1 - motion_length)
else: # west
state0 = np.maximum(0., state0 - motion_length)
self.state[0] = state0
self.state[1] = state1

done = self._is_goal(self.state)
reward = self._compute_reward(self.state)
return self.state, reward, done, {}

def _reset(self):
self.state = np.copy(self.initial_state)
return self.state

def _load_map(self):
self.room_lengths = np.array([1., 1.])
self.initial_state = np.array([0., 0.])
self.goal_state = np.array([1., 1.])
self.goal_width = 0.01
self.motion_noise = 0.05 # probability of no-motion (staying in same state)
self.motion_mean = 0.1 # mean of motion length
self.motion_std = 0.1 * self.motion_mean # std of motion length
self.motion_max = 2.0 * self.motion_mean
self.puddle_centers = []
self.puddle_radii = []
self._build_puddle(np.array([0.2, 0.4]), 0.1)
self._build_puddle(np.array([0.5, 0.8]), 0.1)
self._build_puddle(np.array([0.9, 0.1]), 0.1)
self.num_puddles = len(self.puddle_centers)
self.puddle_cost = 2.0

    def _compute_reward(self, state):
        reward = -1.0
        for i in range(self.num_puddles):
            delta = state - self.puddle_centers[i]
            # Compare the Euclidean distance (not the squared distance) to the
            # puddle radius to decide whether the agent is inside the puddle.
            dist = np.sqrt(np.dot(delta, delta))
            if dist <= self.puddle_radii[i]:
                reward -= self.puddle_cost
        return reward

def _is_goal(self, state):
return state[0] >= self.goal_state[0] - self.goal_width and \
state[1] >= self.goal_state[1] - self.goal_width

def _build_puddle(self, center, radius):
self.puddle_centers.append(center)
self.puddle_radii.append(radius)

def _render(self, mode='human', close=False):
pass
@@ -0,0 +1,35 @@
# See cntk.contrib.deeprl.agent.shared.policy_gradient_parameters for detailed
# explanation of each parameter.

[General]
Agent = actor_critic
Gamma = 0.99
# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing
# PreProcessingArgs = (4,)

[PolicyGradient]
SharedRepresentation = False
# PolicyRepresentation/ValueFunctionRepresentation can be nn, or some
# customized model defined as module_name.method_name, e.g.
# PolicyRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn
PolicyRepresentation = nn
InitialPolicy =
# ValueFunctionRepresentation is ignored when SharedRepresentation is true
ValueFunctionRepresentation = nn
UpdateFrequency = 32
RelativeStepSize = 0.5
RegularizationWeight = 0.001

[NetworkModel]
# Use (a list of integers) when PolicyRepresentation is nn
PolicyNetworkHiddenLayerNodes = [20]

# Use (a list of integers) when ValueFunctionRepresentation is nn, ignored when
# SharedRepresentation is true
ValueNetworkHiddenLayerNodes = [20]

[Optimization]
Momentum = 0.95
InitialEta = 0.01
EtaDecayStepCount = 10000
EtaMinimum = 0.01
@@ -0,0 +1,46 @@
# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed
# explanation of each parameter.

[General]
Agent = qlearning
Gamma = 0.99
# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing
# PreProcessingArgs = (4,)

[QLearningAlgo]
InitialEpsilon = 1.0
EpsilonDecayStepCount = 10000
EpsilonMinimum = 0.01
InitialQ = 0.0
TargetQUpdateFrequency = 100
QUpdateFrequency = 4
MinibatchSize = 32
# QRepresentation can be 'dqn', 'dueling-dqn', or some customized model defined as
# module_name.method_name, e.g.
# QRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn
QRepresentation = dqn
ErrorClipping = False
ReplaysPerUpdate = 1

[ExperienceReplay]
Capacity = 500
StartSize = 100
Prioritized = True
PriorityAlpha = 0.7
PriorityBeta = 1
PriorityEpsilon = 0.0001

[NetworkModel]
# Use (a list of integers) when QRepresentation is 'dqn'
HiddenLayerNodes = [20]

# Or use (a list of integers followed by two lists of integers) when
# QRepresentation is 'dueling-dqn'
; HiddenLayerNodes = [10, [5], [5]]

[Optimization]
Momentum = 0.9
InitialEta = 0.01
EtaDecayStepCount = 10000
EtaMinimum = 0.0001
GradientClippingThreshold = 10
@@ -0,0 +1,17 @@
# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed
# explanation of each parameter.

[General]
Agent = tabular_qlearning
Gamma = 0.99

[QLearningAlgo]
InitialEpsilon = 1.0
EpsilonDecayStepCount = 100000
EpsilonMinimum = 0.01
InitialEta = 0.5
EtaDecayStepCount = 100000
EtaMinimum = 0.1
InitialQ = 0.0
DiscretizationResolution = 10
QRepresentation = tabular
