diff --git a/Examples/ReinforcementLearning/deeprl/README.md b/Examples/ReinforcementLearning/deeprl/README.md
new file mode 100644
index 000000000000..74ec516f7c49
--- /dev/null
+++ b/Examples/ReinforcementLearning/deeprl/README.md
@@ -0,0 +1,41 @@
+Examples of running the CNTK DeepRL toolkit.
+
+Dependencies:
+ - OpenAI Gym: https://gym.openai.com/docs
+ - Atari: https://github.com/openai/gym#atari
+   Use the following command to install Atari games on Windows:
+   pip install git+https://github.com/Kojoley/atari-py.git
+
+The following commands assume Examples/ReinforcementLearning/deeprl/scripts as the working directory.
+
+To train an agent using
+ - TabularQLearning
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/tabular_qlearning.config --eval_period=1000 --eval_steps=20000
+
+ - QLearning
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000
+
+ - ActorCritic
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/policy_gradient.config --eval_period=1000 --eval_steps=20000
+
+ - RandomAgent
+   python run.py --env=CartPole-v0 --max_steps=100 --eval_period=1 --eval_steps=200000
+
+Take QLearning as an example: the command
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000
+tells the QLearning agent to interact with the environment CartPole-v0 for a maximum of
+100000 steps, while evaluation is done every 1000 steps. Each evaluation reports the
+average reward per episode, obtained by interacting with the environment for 20000 steps.
+
+The agent configs, best model and evaluation results are written to --output_dir,
+which defaults to 'output' in the working directory. To view the evaluation
+results, run the following commands in Python:
+
+import shelve
+d = shelve.open('output/output.wks')
+d['reward_history']
+d.close()
+
+Note: reading and writing the wks file simultaneously will corrupt it. To
+check your results while the program is still running, make a copy of the wks file
+and read the numbers from the copy.
diff --git a/Examples/ReinforcementLearning/deeprl/env/__init__.py b/Examples/ReinforcementLearning/deeprl/env/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/Examples/ReinforcementLearning/deeprl/env/env_factory.py b/Examples/ReinforcementLearning/deeprl/env/env_factory.py
new file mode 100644
index 000000000000..f1040870685c
--- /dev/null
+++ b/Examples/ReinforcementLearning/deeprl/env/env_factory.py
@@ -0,0 +1,29 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+
+from gym import envs
+
+from . 
import maze2d, puddleworld + + +def register_env(env_id): + if env_id == 'Maze2D-v0': + envs.register( + id=env_id, + entry_point='env:maze2d.Maze2D', + kwargs={}, + max_episode_steps=200, + reward_threshold=-110.0) + elif env_id == 'PuddleWorld-v0': + envs.register( + id=env_id, + entry_point='env:puddleworld.PuddleWorld', + kwargs={}, + max_episode_steps=200, + reward_threshold=-100.0) + else: + raise ValueError('Cannot find environment "{0}"\n'.format(env_id)) + return True diff --git a/Examples/ReinforcementLearning/deeprl/env/maze2d.py b/Examples/ReinforcementLearning/deeprl/env/maze2d.py new file mode 100644 index 000000000000..e9a957d65dca --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/env/maze2d.py @@ -0,0 +1,95 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import gym +import numpy as np +from gym import spaces +from gym.utils import seeding + + +class Maze2D(gym.Env): + """This class creates a maze problem given a map.""" + + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + + def __init__(self): + self._load_map() + self.viewer = None + self.action_space = spaces.Discrete(4) + self.observation_space = spaces.Discrete(self.room_lengths[0] * + self.room_lengths[1]) + self._seed() + self._reset() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, type(action)) + + if (np.random.uniform(0, 1) > self.motion_noise): + state0 = self.state[0] + state1 = self.state[1] + if action == 0: # north + state1 = np.minimum(self.room_lengths[1] - 1, state1 + 1) + elif action == 1: # east + state0 = np.minimum(self.room_lengths[0] - 1, state0 + 1) + elif action == 2: # south + state1 = np.maximum(0, state1 - 1) + else: # west + state0 = np.maximum(0, state0 - 1) + if not ([state0, state1] in self.wall_states): + self.state[0] = state0 + self.state[1] = state1 + + done = self._is_goal(self.state) + reward = -1.0 + return self._encode_state(self.state), reward, done, {} + + def _reset(self): + rnd_index = np.random.randint(0, len(self.initial_states)) + self.state = self.initial_states[rnd_index][:] + return self._encode_state(self.state) + + def _load_map(self): + self.room_lengths = np.array([25, 25]) + self.initial_states = [[0, 0]] + self.goal_states = [[24, 24]] + self.wall_states = [] + self._build_wall([2, 0], [2, 15]) + self._build_wall([5, 10], [5, 20]) + self._build_wall([5, 12], [13, 12]) + self._build_wall([15, 5], [15, 24]) + self._build_wall([10, 5], [22, 5]) + self.num_states = self.room_lengths[0] * self.room_lengths[1] + self.motion_noise = 0.05 + + def _is_goal(self, state): + return self.state in self.goal_states + + def _encode_state(self, state): + return int(state[1] * self.room_lengths[0] + state[0]) + + def _build_wall(self, start, end): + x_min = np.maximum(0, np.minimum(start[0], end[0])) + x_max = np.minimum(self.room_lengths[0] - 1, + np.maximum(start[0], end[0])) + y_min = np.maximum(0, np.minimum(start[1], end[1])) + y_max = np.minimum(self.room_lengths[1] - 1, + np.maximum(start[1], end[1])) + for x in range(x_min, x_max + 1): + for y in range(y_min, y_max + 1): + if not ([x, y] in self.goal_states or + [x, y] in self.initial_states): + self.wall_states.append([x, y]) + 
+ def _render(self, mode='human', close=False): + pass diff --git a/Examples/ReinforcementLearning/deeprl/env/puddleworld.py b/Examples/ReinforcementLearning/deeprl/env/puddleworld.py new file mode 100644 index 000000000000..18b8eb89f155 --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/env/puddleworld.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import gym +import numpy as np +from gym import spaces +from gym.utils import seeding + + +class PuddleWorld(gym.Env): + """This class creates a continous-state maze problem given a map.""" + + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + + def __init__(self): + self._load_map() + self.viewer = None + self.action_space = spaces.Discrete(4) + self.observation_space = spaces.Box(np.zeros(2), self.room_lengths) + self._seed() + self._reset() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, type(action)) + + if (np.random.uniform(0., 1.) > self.motion_noise): + state0 = self.state[0] + state1 = self.state[1] + # Motion length is a truncated normal random variable. + motion_length = np.maximum( + 0., + np.minimum( + self.motion_max, + np.random.normal(self.motion_mean, self.motion_std))) + if action == 0: # north + state1 = np.minimum(self.room_lengths[1], + state1 + motion_length) + elif action == 1: # east + state0 = np.minimum(self.room_lengths[0], + state0 + motion_length) + elif action == 2: # south + state1 = np.maximum(0., state1 - motion_length) + else: # west + state0 = np.maximum(0., state0 - motion_length) + self.state[0] = state0 + self.state[1] = state1 + + done = self._is_goal(self.state) + reward = self._compute_reward(self.state) + return self.state, reward, done, {} + + def _reset(self): + self.state = np.copy(self.initial_state) + return self.state + + def _load_map(self): + self.room_lengths = np.array([1., 1.]) + self.initial_state = np.array([0., 0.]) + self.goal_state = np.array([1., 1.]) + self.goal_width = 0.01 + self.motion_noise = 0.05 # probability of no-motion (staying in same state) + self.motion_mean = 0.1 # mean of motion length + self.motion_std = 0.1 * self.motion_mean # std of motion length + self.motion_max = 2.0 * self.motion_mean + self.puddle_centers = [] + self.puddle_radii = [] + self._build_puddle(np.array([0.2, 0.4]), 0.1) + self._build_puddle(np.array([0.5, 0.8]), 0.1) + self._build_puddle(np.array([0.9, 0.1]), 0.1) + self.num_puddles = len(self.puddle_centers) + self.puddle_cost = 2.0 + + def _compute_reward(self, state): + reward = -1 + for i in range(self.num_puddles): + delta = state - self.puddle_centers[i] + dist = np.dot(delta, delta) + if dist <= self.puddle_radii[i]: + reward -= self.puddle_cost + return reward + + def _is_goal(self, state): + return state[0] >= self.goal_state[0] - self.goal_width and \ + state[1] >= self.goal_state[1] - self.goal_width + + def _build_puddle(self, center, radius): + self.puddle_centers.append(center) + self.puddle_radii.append(radius) + + def _render(self, mode='human', close=False): + pass diff --git a/Examples/ReinforcementLearning/deeprl/scripts/config_examples/policy_gradient.config 
b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/policy_gradient.config new file mode 100644 index 000000000000..218b7db8b20c --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/policy_gradient.config @@ -0,0 +1,35 @@ +# See cntk.contrib.deeprl.agent.shared.policy_gradient_parameters for detailed +# explanation of each parameter. + +[General] +Agent = actor_critic +Gamma = 0.99 +# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing +# PreProcessingArgs = (4,) + +[PolicyGradient] +SharedRepresentation = False +# PolicyRepresentation/ValueFunctionRepresentation can be nn, or some +# customized model defined as module_name.method_name, e.g. +# PolicyRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn +PolicyRepresentation = nn +InitialPolicy = +# ValueFunctionRepresentation is ignored when SharedRepresentation is true +ValueFunctionRepresentation = nn +UpdateFrequency = 32 +RelativeStepSize = 0.5 +RegularizationWeight = 0.001 + +[NetworkModel] +# Use (a list of integers) when PolicyRepresentation is nn +PolicyNetworkHiddenLayerNodes = [20] + +# Use (a list of integers) when ValueFunctionRepresentation is nn, ignored when +# SharedRepresentation is true +ValueNetworkHiddenLayerNodes = [20] + +[Optimization] +Momentum = 0.95 +InitialEta = 0.01 +EtaDecayStepCount = 10000 +EtaMinimum = 0.01 diff --git a/Examples/ReinforcementLearning/deeprl/scripts/config_examples/qlearning.config b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/qlearning.config new file mode 100644 index 000000000000..9d3b692c54bd --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/qlearning.config @@ -0,0 +1,46 @@ +# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed +# explanation of each parameter. + +[General] +Agent = qlearning +Gamma = 0.99 +# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing +# PreProcessingArgs = (4,) + +[QLearningAlgo] +InitialEpsilon = 1.0 +EpsilonDecayStepCount = 10000 +EpsilonMinimum = 0.01 +InitialQ = 0.0 +TargetQUpdateFrequency = 100 +QUpdateFrequency = 4 +MinibatchSize = 32 +# QRepresentation can be 'dqn', 'dueling-dqn', or some customized model defined as +# module_name.method_name, e.g. +# QRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn +QRepresentation = dqn +ErrorClipping = False +ReplaysPerUpdate = 1 + +[ExperienceReplay] +Capacity = 500 +StartSize = 100 +Prioritized = True +PriorityAlpha = 0.7 +PriorityBeta = 1 +PriorityEpsilon = 0.0001 + +[NetworkModel] +# Use (a list of integers) when QRepresentation is 'dqn' +HiddenLayerNodes = [20] + +# Or use (a list of integers followed by two lists of integers) when +# QRepresentation is 'dueling-dqn' +; HiddenLayerNodes = [10, [5], [5]] + +[Optimization] +Momentum = 0.9 +InitialEta = 0.01 +EtaDecayStepCount = 10000 +EtaMinimum = 0.0001 +GradientClippingThreshold = 10 diff --git a/Examples/ReinforcementLearning/deeprl/scripts/config_examples/tabular_qlearning.config b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/tabular_qlearning.config new file mode 100644 index 000000000000..1367ae374867 --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/tabular_qlearning.config @@ -0,0 +1,17 @@ +# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed +# explanation of each parameter. 
+ +[General] +Agent = tabular_qlearning +Gamma = 0.99 + +[QLearningAlgo] +InitialEpsilon = 1.0 +EpsilonDecayStepCount = 100000 +EpsilonMinimum = 0.01 +InitialEta = 0.5 +EtaDecayStepCount = 100000 +EtaMinimum = 0.1 +InitialQ = 0.0 +DiscretizationResolution = 10 +QRepresentation = tabular diff --git a/Examples/ReinforcementLearning/deeprl/scripts/run.py b/Examples/ReinforcementLearning/deeprl/scripts/run.py new file mode 100644 index 000000000000..50238dea1434 --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/run.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import argparse +import os +import shelve +import sys +import time +from contextlib import closing + +import numpy as np +from gym import envs +from gym.envs.atari.atari_env import AtariEnv + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +from cntk.contrib.deeprl.agent import agent_factory +from env import env_factory + + +def new_episode(): + """Start a new episode. + + For Atari games, perform no-op actions at the beginning of the episode. + """ + observation = env.reset() + if args.render: + env.render() + if isinstance(env.env, AtariEnv): + for t in range(args.num_noop): + observation, reward, isTerminal, _ = env.step(0) + if isTerminal: + print('WARNING: Terminal signal received after {0} steps' + ''.format(t)) + if args.render: + env.render() + return observation + + +def evaluate_agent_if_necessary(eval_count, start_time): + """Evaluate agent every --eval_period steps.""" + if agent.step_count >= eval_count * args.eval_period: + elapsed_time = time.time() - start_time + total_reward = 0 + num_episodes = 0 + episode_reward = 0 + i = 0 + agent.enter_evaluation() + + observation = new_episode() + while i < args.eval_steps: + i += 1 + action = agent.evaluate(observation) + observation, reward, isTerminal, _ = env.step(action) + if args.render: + env.render() + episode_reward += reward + if isTerminal: + num_episodes += 1 + total_reward += episode_reward + episode_reward = 0 + observation = new_episode() + + reward = episode_reward if num_episodes == 0 \ + else total_reward / num_episodes + print('\nAverage reward per episode after training {0} steps: {1}\n' + ''.format(agent.step_count, reward)) + if len(reward_history) == 0 or reward > max(reward_history): + agent.set_as_best_model() + reward_history.append(reward) + if len(training_time) != 0: + elapsed_time += training_time[-1] + training_time.append(elapsed_time) + + # Save results and update eval_count. + filename_prefix = os.path.join(args.output_dir, args.output_dir) + agent.save(filename_prefix + '.model') + with closing(shelve.open(filename_prefix + '.wks', + 'n' if eval_count == 1 else 'c', + 0, + True)) as shelf: + if 'step_count' not in shelf: + shelf['step_count'] = [] + shelf['step_count'].append(agent.step_count) + shelf['reward_history'] = reward_history + shelf['training_time_sec'] = training_time + agent.exit_evaluation() + eval_count += 1 + start_time = time.time() + + return eval_count, start_time + + +if __name__ == '__main__': + # Parse input arguments. 
+ parser = argparse.ArgumentParser() + parser.add_argument('--env', type=str, default='CartPole-v0', + help='Environment that agent iteracts with.') + parser.add_argument('--num_noop', type=int, default=30, help='Number of ' + 'no-op actions to be performed by the agent at the ' + 'start of an episode, for Atari environment only.') + parser.add_argument('--agent_config', type=str, default='', + help='Config file for agent.') + parser.add_argument('--max_steps', type=int, default=1000000, + help='Maximum steps to train an agent.') + parser.add_argument('--max_episode_steps', type=int, default=0, + help='Maximum steps per episode. Use environment ' + 'specific value if 0.') + parser.add_argument('--eval_period', type=int, default=250000, + help='Number of steps taken between each evaluation.') + parser.add_argument('--eval_steps', type=int, default=125000, + help='Number of steps taken during each evaluation.') + parser.add_argument('--verbose', action='store_true', help='Output debug ' + 'info if set to True.') + parser.add_argument('--output_dir', type=str, default='output', + help='Directory where workspace file and model file ' + 'are saved to. Model file will be named as ' + 'output_dir.model, and workspace file will be named ' + 'as output_dir.wks.') + parser.add_argument('--render', action='store_true', help='Render ' + 'environment if set to True.') + parser.add_argument('--seed', type=int, default=1234567, help='Seed for ' + 'random number generator. Negative value is ignored.') + args = parser.parse_args() + + if (args.seed >= 0): + np.random.seed(args.seed) + + # Use xrange for python 2.7 to speed up. + if sys.version_info.major < 3: + range = xrange + + # Create an OpenAI Gym environment, and obtain its state/action + # information. + if args.env not in envs.registry.env_specs.keys(): + # Try to find from local environment libraries. + env_factory.register_env(args.env) + env = envs.make(args.env) + o_space = env.observation_space + a_space = env.action_space + image_observation = True if isinstance( + env.env, AtariEnv) and env.env._obs_type == 'image' else False + print("Loaded environment '{0}'".format(args.env)) + print("Observation space: '{0}'".format(o_space)) + print("Action space: '{0}'".format(a_space)) + print('Is observation an image: {0}'.format(image_observation)) + + if args.max_episode_steps <= 0: + args.max_episode_steps = \ + env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] + + # Create an agent. + agent = agent_factory.make_agent(args.agent_config, + o_space, + a_space) + + # Create output folder, and save current parameter settings. + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + agent.save_parameter_settings( + os.path.join(args.output_dir, args.output_dir + '.params')) + + eval_count = 1 + reward_history = [] + training_time = [] + start_time = time.time() + # Stop when maximum number of steps are reached. + while agent.step_count < args.max_steps: + # Evaluate agent every --eval_period steps. + eval_count, start_time = evaluate_agent_if_necessary( + eval_count, start_time) + # Learn from new episode. 
+ observation = new_episode() + action, debug_info = agent.start(observation) + rewards = 0 + steps = 0 + for t in range(args.max_episode_steps): + observation, reward, isTerminal, _ = env.step(action) + if args.render: + env.render() + if args.verbose: + print('\tStep\t{0}\t/\tAction\t{1},{2}\t/\tReward\t{3}' + ''.format( + agent.step_count, + action, + debug_info.get('action_behavior'), + reward)) + rewards += reward + steps += 1 + if isTerminal: + agent.end(reward, observation) + break + action, debug_info = agent.step(reward, observation) + print('Episode {0}\t{1}/{2} steps\t{3} total reward\tterminated = {4}' + ''.format( + agent.episode_count, steps, agent.step_count, rewards, isTerminal)) + sys.stdout.flush() + env.close() diff --git a/Source/ActionsLib/NetworkDescriptionLanguage.cpp b/Source/ActionsLib/NetworkDescriptionLanguage.cpp index 63b00e11f56a..7094dd8656a2 100644 --- a/Source/ActionsLib/NetworkDescriptionLanguage.cpp +++ b/Source/ActionsLib/NetworkDescriptionLanguage.cpp @@ -176,6 +176,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable) else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true; + else if (EqualInsensitive(nodeType, OperationNameOf(CoshNode), L"Cosh")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CosineNode), L"Cos")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CrossEntropyNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CrossEntropyWithSoftmaxNode), L"CEWithSM")) ret = true; @@ -224,6 +225,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable) #endif else if (EqualInsensitive(nodeType, OperationNameOf(SequenceWithSoftmaxNode), L"SEWithSM")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SigmoidNode))) ret = true; + else if (EqualInsensitive(nodeType, OperationNameOf(SinhNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SinNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SoftmaxNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SparseInputValue), L"SparseInput")) ret = true; diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 4dff9b81a895..2c94913c24e6 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -645,6 +645,7 @@ ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new Computa // TODO: ColumnElementTimes = ElementTimes CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'CosDistance' ; inputs = _AsNodes (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = _AsNodes (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ] +Cosh(x, tag='') = new ComputationNode [ operation = 'Cosh' ; inputs = _AsNodes (x) /*plus the function args*/ ] Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = _AsNodes (x) /*plus the function args*/ ] CrossEntropy(refProbVectorSequence, 
outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = _AsNodes (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ] DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = _AsNodes (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ] @@ -674,6 +675,7 @@ Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = ' # TODO: Scale = ElementTimes ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = _AsNodes (cond : indexSequence : sourceData) /*plus the function args*/ ] Sin(z, tag='') = new ComputationNode [ operation = 'Sin' ; inputs = _AsNodes (z) /*plus the function args*/ ] +Sinh(x, tag='') = new ComputationNode [ operation = 'Sinh' ; inputs = _AsNodes (x) /*plus the function args*/ ] Softmax (z, axis=0, tag='') = # TODO: replace this with more efficient version below once we have ReduceLogSum if axis == 0 then new ComputationNode [ operation = 'Softmax' ; inputs = _AsNodes (z) /*plus the function args*/ ] else diff --git a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h index b1cd71455be8..f0273b47c8df 100644 --- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h +++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h @@ -3582,6 +3582,16 @@ namespace CNTK /// CNTK_API FunctionPtr Cos(const Variable& operand, const std::wstring& name = L""); + /// + /// Create an instance of the CNTK built-in elementwise cosh operation with the specified input operand. + /// + CNTK_API FunctionPtr Cosh(const Variable& operand, const std::wstring& name = L""); + + /// + /// Create an instance of the CNTK built-in elementwise sinh operation with the specified input operand. + /// + CNTK_API FunctionPtr Sinh(const Variable& operand, const std::wstring& name = L""); + /// /// Create an instance of the CNTK built-in elementwise linear rectifier operation with the specified input operand. 
/// diff --git a/Source/CNTKv2LibraryDll/BackCompat.cpp b/Source/CNTKv2LibraryDll/BackCompat.cpp index d528c5c1b2b4..79a29bb5c11a 100644 --- a/Source/CNTKv2LibraryDll/BackCompat.cpp +++ b/Source/CNTKv2LibraryDll/BackCompat.cpp @@ -146,6 +146,10 @@ namespace CNTK opType = PrimitiveOpType::Cos; else if (node->OperationName() == OperationNameOf(SinNode)) opType = PrimitiveOpType::Sin; + else if (node->OperationName() == OperationNameOf(CoshNode)) + opType = PrimitiveOpType::Cosh; + else if (node->OperationName() == OperationNameOf(SinhNode)) + opType = PrimitiveOpType::Sinh; else if (node->OperationName() == OperationNameOf(PassNode)) opType = PrimitiveOpType::Pass; else if (node->OperationName() == OperationNameOf(LabelsToGraphNode)) diff --git a/Source/CNTKv2LibraryDll/CompositeFunction.cpp b/Source/CNTKv2LibraryDll/CompositeFunction.cpp index cfc106f3be77..4c15309ec87b 100755 --- a/Source/CNTKv2LibraryDll/CompositeFunction.cpp +++ b/Source/CNTKv2LibraryDll/CompositeFunction.cpp @@ -628,6 +628,12 @@ namespace CNTK case PrimitiveOpType::Sin: computationNodePtr = New>(network->GetDeviceId(), internalNodeName); break; + case PrimitiveOpType::Cosh: + computationNodePtr = New>(network->GetDeviceId(), internalNodeName); + break; + case PrimitiveOpType::Sinh: + computationNodePtr = New>(network->GetDeviceId(), internalNodeName); + break; case PrimitiveOpType::ReLU: computationNodePtr = New>(network->GetDeviceId(), internalNodeName); break; diff --git a/Source/CNTKv2LibraryDll/Function.cpp b/Source/CNTKv2LibraryDll/Function.cpp index 64594ecb4f9a..9960148152ac 100755 --- a/Source/CNTKv2LibraryDll/Function.cpp +++ b/Source/CNTKv2LibraryDll/Function.cpp @@ -1050,6 +1050,16 @@ namespace CNTK return UnaryOp(PrimitiveOpType::Cos, operand, Dictionary(), name); } + FunctionPtr Cosh(const Variable& operand, const std::wstring& name) + { + return UnaryOp(PrimitiveOpType::Cosh, operand, Dictionary(), name); + } + + FunctionPtr Sinh(const Variable& operand, const std::wstring& name) + { + return UnaryOp(PrimitiveOpType::Sinh, operand, Dictionary(), name); + } + FunctionPtr ReLU(const Variable& operand, const std::wstring& name) { return UnaryOp(PrimitiveOpType::ReLU, operand, Dictionary(), name); diff --git a/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp b/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp index fb12b474d363..72b5c582591a 100644 --- a/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp +++ b/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp @@ -363,6 +363,8 @@ namespace CNTK case PrimitiveOpType::LogSoftmax: case PrimitiveOpType::Sin: case PrimitiveOpType::Cos: + case PrimitiveOpType::Cosh: + case PrimitiveOpType::Sinh: case PrimitiveOpType::Pass: case PrimitiveOpType::LabelsToGraph: case PrimitiveOpType::StopGradient: diff --git a/Source/CNTKv2LibraryDll/PrimitiveFunction.h b/Source/CNTKv2LibraryDll/PrimitiveFunction.h index 36502515b835..9ebb3d4976e5 100644 --- a/Source/CNTKv2LibraryDll/PrimitiveFunction.h +++ b/Source/CNTKv2LibraryDll/PrimitiveFunction.h @@ -84,6 +84,8 @@ namespace CNTK {PrimitiveOpType::CosDistance, L"CosDistance"}, {PrimitiveOpType::Sin, L"Sin"}, {PrimitiveOpType::Cos, L"Cos"}, + {PrimitiveOpType::Cosh, L"Cosh"}, + {PrimitiveOpType::Sinh, L"Sinh"}, {PrimitiveOpType::Pass, L"Pass"}, {PrimitiveOpType::Block, L"Block"}, {PrimitiveOpType::Unpooling, L"Unpooling"}, diff --git a/Source/CNTKv2LibraryDll/PrimitiveOpType.h b/Source/CNTKv2LibraryDll/PrimitiveOpType.h index 01b886648c0d..d4b9773286f0 100644 --- a/Source/CNTKv2LibraryDll/PrimitiveOpType.h +++ 
b/Source/CNTKv2LibraryDll/PrimitiveOpType.h @@ -86,6 +86,8 @@ namespace CNTK Gather = 74, StableSigmoid = 75, RandomDistribution = 76, + Sinh = 77, + Cosh = 78, // New op types should only be appended to the end of this list UnknownOP // and UnknownOP should always be last. diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index c8487eeaea9d..8ea0db5c8415 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -45,6 +45,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(ClipNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CosDistanceNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CosDistanceWithNegativeSamplesNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(CoshNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CosineNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CropNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CrossEntropyNode)) return New>(forward<_Types>(_Args)...); @@ -116,6 +117,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(SigmoidNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(StableSigmoidNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SinNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(SinhNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SliceNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SoftmaxNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SqrtNode)) return New>(forward<_Types>(_Args)...); @@ -645,6 +647,18 @@ shared_ptr> ComputationNetworkBuilder::Sin(c return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName), { a }); } +template +shared_ptr> ComputationNetworkBuilder::Cosh(const ComputationNodePtr a, const std::wstring nodeName) +{ + return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName), { a }); +} + +template +shared_ptr> ComputationNetworkBuilder::Sinh(const ComputationNodePtr a, const std::wstring nodeName) +{ + return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName), { a }); +} + template shared_ptr> ComputationNetworkBuilder::Abs(const ComputationNodePtr a, const std::wstring nodeName) { diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h index 5224ab6a7b2f..13b6ae9934e0 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h @@ -124,6 +124,7 @@ class ComputationNetworkBuilder ComputationNodePtr Clip(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName = L""); ComputationNodePtr Cos(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L""); + ComputationNodePtr Cosh(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr CrossEntropy(const ComputationNodePtr label, const 
ComputationNodePtr prediction, const std::wstring nodeName = L""); ComputationNodePtr CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L""); ComputationNodePtr ForwardBackward(const ComputationNodePtr graph, const ComputationNodePtr features, int blankTokenId, int delayConstraint, const std::wstring nodeName = L""); @@ -179,6 +180,7 @@ class ComputationNetworkBuilder ComputationNodePtr SequenceWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr loglikelihood, const std::wstring nodeName = L""); ComputationNodePtr Sigmoid(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr Sin(const ComputationNodePtr a, const std::wstring nodeName = L""); + ComputationNodePtr Sinh(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr Softmax(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr Sqrt(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr SquareError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L""); diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 09ec5acb25d9..501bf21ea316 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -114,6 +114,8 @@ class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode, pub // FloorNode (input) // CosineNode (input) // SinNode (input) +// CoshNode (input) +// SinhNode (input) // Abs(input) // Negate (input) // Sqrt (input) @@ -145,6 +147,7 @@ class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode, pub // Name Forward and Backward opcodes Gradient optype DeclareUnaryElementWiseWithOpCodeNode(Abs, Abs, ElementwiseProductWithAbsDerivative, binaryWithInputGradient); +DeclareUnaryElementWiseWithOpCodeNode(Cosh, Cosh, ElementwiseProductWithCoshDerivative, binaryWithInputGradient); DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, binaryWithInputGradient); DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None, noGradient); @@ -156,6 +159,7 @@ DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Sin, Sin, ElementwiseProductWithSinDerivative, binaryWithInputGradient); +DeclareUnaryElementWiseWithOpCodeNode(Sinh, Sinh, ElementwiseProductWithSinhDerivative, binaryWithInputGradient); DeclareUnaryElementWiseWithOpCodeNode(Sqrt, Sqrt, ElementwiseProductWithSqrtDerivative, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(ExponentialLinearUnit, ExponentialLinearUnit, ElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, binaryWithOutputGradient); diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 9c2b60a14fe6..f3204010fb7f 100755 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -256,6 +256,12 
@@ class MATH_API CPUMatrix : public BaseMatrix CPUMatrix& InplaceNegativeSine(); CPUMatrix& AssignNegativeSineOf(const CPUMatrix& a); + CPUMatrix& InplaceCosh(); + CPUMatrix& AssignCoshOf(const CPUMatrix& a); + + CPUMatrix& InplaceSinh(); + CPUMatrix& AssignSinhOf(const CPUMatrix& a); + CPUMatrix& InplaceAbs(); CPUMatrix& AssignAbsOf(const CPUMatrix& a); diff --git a/Source/Math/CPUMatrixImpl.h b/Source/Math/CPUMatrixImpl.h index 7d1ed50dc696..6ed53e616c5f 100644 --- a/Source/Math/CPUMatrixImpl.h +++ b/Source/Math/CPUMatrixImpl.h @@ -2726,6 +2726,60 @@ CPUMatrix& CPUMatrix::AssignNegativeSineOf(const CPUMatrix +CPUMatrix& CPUMatrix::InplaceCosh() +{ + return AssignCoshOf(*this); +} + +template +CPUMatrix& CPUMatrix::AssignCoshOf(const CPUMatrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignCoshOf: Matrix a is empty."); + + auto& us = *this; + if (this != &a) + RequireSize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord (i, j, a) + { + const ElemType v = a(i, j); + us(i, j) = cosh(v); + } + + return *this; +} + +//[this]=sinh([this]) element wise +template +CPUMatrix& CPUMatrix::InplaceSinh() +{ + return AssignSinhOf(*this); +} + +template +CPUMatrix& CPUMatrix::AssignSinhOf(const CPUMatrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignSinhOf: Matrix a is empty."); + + auto& us = *this; + if (this != &a) + RequireSize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord (i, j, a) + { + const ElemType v = a(i, j); + us(i, j) = sinh(v); + } + + return *this; +} + //Threshold truncating: this[i] = max( this[i], threshold ) template CPUMatrix& CPUMatrix::InplaceTruncateBottom(const ElemType threshold) diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index dfa07c8aa2c9..5e6d8a91b8a6 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -85,7 +85,7 @@ enum ElementWiseOperator // unary (or binary with constant parameter) opCopy, opNegate, opNot, opAbs, opFloor, opReciprocal, - opSigmoid, opTanh, opSqr, opSqrt, opExp, opLog, opLinearRectifier, opCosine, opSin, opExponentialLinearUnit, opStableSigmoid, + opSigmoid, opTanh, opSqr, opSqrt, opExp, opLog, opLinearRectifier, opCosine, opSin, opCosh, opSinh, opExponentialLinearUnit, opStableSigmoid, // unary ops for use by Matrix class only (there is no TensorView implementation) opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine, opExponentialLinearUnitDerivative, opStableSigmoidDerivative, // binary @@ -96,6 +96,7 @@ enum ElementWiseOperator opElementwiseProductWithSigmoidDerivativeFromOutput, opElementwiseProductWithTanhDerivativeFromOutput, opElementwiseProductWithLinearRectifierDerivativeFromOutput, opElementwiseProductWithLogDerivativeFromOutput, opElementwiseProductWithCosDerivative, opElementwiseProductWithSinDerivative, + opElementwiseProductWithCoshDerivative, opElementwiseProductWithSinhDerivative, opElementwiseProductWithAbsDerivative, opElementwiseProductWithSqrtDerivative, opElementwiseProductWithReciprocalDerivative, opSqrOfDifference, opElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, @@ -133,6 +134,8 @@ enum ElementWiseOperator Macro(LinearRectifier); \ Macro(Cosine); \ Macro(Sin); \ + Macro(Cosh); \ + Macro(Sinh); \ Macro(ExponentialLinearUnit); \ Macro(StableSigmoid); @@ -163,6 +166,8 @@ enum ElementWiseOperator Macro(ElementwiseProductWithLogDerivativeFromOutput); \ Macro(ElementwiseProductWithCosDerivative); \ Macro(ElementwiseProductWithSinDerivative); \ + Macro(ElementwiseProductWithCoshDerivative); \ + 
Macro(ElementwiseProductWithSinhDerivative); \ Macro(ElementwiseProductWithAbsDerivative); \ Macro(ElementwiseProductWithReciprocalDerivative); \ Macro(ElementwiseProductWithSqrtDerivative); \ diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 221ec378c397..43f55592844f 100755 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -458,6 +458,10 @@ void GPUMatrix::performElementWiseFunction(ElementWiseOperator kind, c return _elementWiseCosineOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opNegativeSine: return _elementWiseNegativeSineOnCuda<<>>(src, Data(), N); + case ElementWiseOperator::opCosh: + return _elementWiseCoshOnCuda<<>>(src, Data(), N); + case ElementWiseOperator::opSinh: + return _elementWiseSinhOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opSigmoidDerivative: return _elementWiseSigmoidDerivativeOnCuda<<>>(src, Data(), N); default: LogicError("performElementWiseFunction: unexpected op code %d", (int)kind); @@ -2333,6 +2337,12 @@ DEF_ELEMWISE_ASSIGN_FUNC(Cosine) DEF_ELEMWISE_INPLACE_FUNC(NegativeSine) DEF_ELEMWISE_ASSIGN_FUNC(NegativeSine) +DEF_ELEMWISE_INPLACE_FUNC(Cosh) +DEF_ELEMWISE_ASSIGN_FUNC(Cosh) + +DEF_ELEMWISE_INPLACE_FUNC(Sinh) +DEF_ELEMWISE_ASSIGN_FUNC(Sinh) + template GPUMatrix& GPUMatrix::InplaceTruncateBottom(const ElemType threshold) { diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 39ebd5ce1461..369fef96f92e 100755 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -380,6 +380,12 @@ class MATH_API GPUMatrix : public BaseMatrix GPUMatrix& InplaceNegativeSine(); GPUMatrix& AssignNegativeSineOf(const GPUMatrix& a); + GPUMatrix& InplaceCosh(); + GPUMatrix& AssignCoshOf(const GPUMatrix& a); + + GPUMatrix& InplaceSinh(); + GPUMatrix& AssignSinhOf(const GPUMatrix& a); + GPUMatrix& InplaceAbs(); GPUMatrix& AssignAbsOf(const GPUMatrix& a); diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 61e9e2a8ce20..4a6e0d6fdf84 100755 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -378,6 +378,26 @@ __global__ void _elementWiseNegativeSineOnCuda( res[id] = -sin_(a[id]); }; +template +__global__ void _elementWiseCoshOnCuda( + const ElemType* a, + ElemType* res, + const CUDA_LONG N) +{ + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + res[id] = cosh_(a[id]); +}; + +template +__global__ void _elementWiseSinhOnCuda( + const ElemType* a, + ElemType* res, + const CUDA_LONG N) +{ + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + res[id] = sinh_(a[id]); +}; + template __global__ void _setValue( ElemType* a, diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 2e62ff501240..30f0045db119 100755 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -3133,6 +3133,72 @@ Matrix& Matrix::AssignNegativeSineOf(const Matrix& return *this; } +//[this]=cosh([this]) element wise +template +Matrix& Matrix::InplaceCosh() +{ + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->InplaceCosh(), + m_GPUMatrix->InplaceCosh(), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +template +Matrix& Matrix::AssignCoshOf(const Matrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignCoshOf: Matrix a is empty."); + + DecideAndMoveToRightDevice(a, *this); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->AssignCoshOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignCoshOf(*a.m_GPUMatrix), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +//[this]=sinh([this]) 
element wise +template +Matrix& Matrix::InplaceSinh() +{ + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->InplaceSinh(), + m_GPUMatrix->InplaceSinh(), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +template +Matrix& Matrix::AssignSinhOf(const Matrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignSinhOf: Matrix a is empty."); + + DecideAndMoveToRightDevice(a, *this); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->AssignSinhOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSinhOf(*a.m_GPUMatrix), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + template Matrix& Matrix::InplaceTruncate(const ElemType threshold) { diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 72ef97bb524e..514f29a32a27 100755 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -407,6 +407,12 @@ class MATH_API Matrix : public MatrixBase Matrix& InplaceNegativeSine(); Matrix& AssignNegativeSineOf(const Matrix& a); + Matrix& InplaceCosh(); + Matrix& AssignCoshOf(const Matrix& a); + + Matrix& InplaceSinh(); + Matrix& AssignSinhOf(const Matrix& a); + Matrix& InplaceLog10(); Matrix& AssignLog10Of(const Matrix& a); diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index 1fd0dd509663..3426471ee442 100755 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -1535,6 +1535,30 @@ GPUMatrix& GPUMatrix::AssignNegativeSineOf(const GPUMatrix +GPUMatrix& GPUMatrix::InplaceCosh() +{ + return *this; +} + +template +GPUMatrix& GPUMatrix::AssignCoshOf(const GPUMatrix& /*a*/) +{ + return *this; +} + +template +GPUMatrix& GPUMatrix::InplaceSinh() +{ + return *this; +} + +template +GPUMatrix& GPUMatrix::AssignSinhOf(const GPUMatrix& /*a*/) +{ + return *this; +} + template GPUMatrix& GPUMatrix::InplaceTruncateBottom(const ElemType threshold) { diff --git a/Source/Math/TensorOps.h b/Source/Math/TensorOps.h index 26abdb5ca2be..7b2dd07b2629 100644 --- a/Source/Math/TensorOps.h +++ b/Source/Math/TensorOps.h @@ -49,6 +49,8 @@ OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); OverloadUnaryMathFns(floor); OverloadUnaryMathFns(log1p); +OverloadUnaryMathFns(sinh); +OverloadUnaryMathFns(cosh); #pragma pop_macro("OverloadUnaryMathFns") @@ -271,6 +273,8 @@ DefUnaryOp(Sin, sin_(a)); DefUnaryOp(Reciprocal, a == 0 ? 0 : 1 / a); DefUnaryOp(ExponentialLinearUnit, a >= 0 ? a : (exp_(a)-1)); DefUnaryOp(StableSigmoid, StableSigmoid(a)); +DefUnaryOp(Sinh, sinh_(a)); +DefUnaryOp(Cosh, cosh_(a)); #pragma pop_macro("DefUnaryOp") #pragma push_macro("DefBinaryOp") @@ -312,6 +316,8 @@ DefBinaryOp(ElementwiseProductWithReciprocalDerivative, a * -Sqr(b)); // b = out DefBinaryOp(ElementwiseProductWithSqrtDerivative, a / (2 * b)); // b = output; d/dx sqrt(x) = 1/(2 * sqrt(x)) --> note this is the same as ElementwiseQuotient w a constant; if more show up like this we should add more template params DefBinaryOp(SqrOfDifference, Sqr(a - b)); DefBinaryOp(ElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, b >= 0 ? 
a : a*(1+b)); // b = output; +DefBinaryOp(ElementwiseProductWithSinhDerivative, a * cosh_(b)); // note: b = input for sinh() +DefBinaryOp(ElementwiseProductWithCoshDerivative, a * sinh_(b)); // note: b = input for cosh() //DefBinaryOp(Index, IndexElement(a, b, i)); // note: this one uses the third argument #pragma pop_macro("DefBinaryOp") diff --git a/Source/Readers/HTKDeserializers/ConfigHelper.cpp b/Source/Readers/HTKDeserializers/ConfigHelper.cpp index 94df730c80fe..871c8bf975aa 100644 --- a/Source/Readers/HTKDeserializers/ConfigHelper.cpp +++ b/Source/Readers/HTKDeserializers/ConfigHelper.cpp @@ -7,6 +7,7 @@ #include "ConfigHelper.h" #include "DataReader.h" #include "StringUtil.h" +#include namespace CNTK { @@ -164,9 +165,16 @@ vector ConfigHelper::GetMlfPaths() const } wstring list = m_config(L"mlfFileList"); - for (msra::files::textreader r(list); r;) + if (list.find(':') == string::npos) { - result.push_back(r.wgetline()); + for (msra::files::textreader r(list); r;) + { + result.push_back(r.wgetline()); + } + } + else + { + result = m_config(L"mlfFileList", ConfigParameters::Array(stringargvector(vector{}))); } } diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/deeprl_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/deeprl_test.py new file mode 100644 index 000000000000..0de6f86b2e3e --- /dev/null +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/deeprl_test.py @@ -0,0 +1,41 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import os +import platform +import shelve +import shutil +import subprocess + +import pytest + + +def test_deeprl(): + if platform.system() != 'Linux': + pytest.skip('test only runs on Linux (Gym Atari dependency)') + + test_dir = os.path.dirname(os.path.abspath(__file__)) + script_dir = os.path.join(test_dir, '..', '..', '..', '..', 'Examples', + 'ReinforcementLearning', 'deeprl', 'scripts') + script_file = os.path.join(script_dir, 'run.py') + config_file = os.path.join(script_dir, 'config_examples', + 'qlearning.config') + + subprocess.call([ + 'python', script_file, '--env=CartPole-v0', '--max_steps=6000', + '--agent_config=' + config_file, '--eval_period=1000', + '--eval_steps=20000' + ]) + + assert os.path.exists( + os.path.join(test_dir, 'output', 'output.params')) == True + + wks = shelve.open(os.path.join(test_dir, 'output', 'output.wks')) + rewards = wks['reward_history'] + assert len(rewards) >= 5 and len(rewards) <= 6 + assert max(rewards) >= 120 + + shutil.rmtree(os.path.join(test_dir, 'output')) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py index 18e6b5d3b4f6..4b70bf29aa71 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py @@ -6,7 +6,6 @@ abs_path = os.path.dirname(os.path.abspath(__file__)) data_path = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Speech", "AN4", "Data") - def test_htk_deserializers(): mbsize = 640 epoch_size = 1000 * mbsize @@ -55,4 +54,28 @@ def test_htk_deserializers(): assert True os.chdir(abs_path) -#test_htk_deserializers() + +def test_multiple_mlf_files(): + os.chdir(data_path) + + feature_dim = 33 + num_classes = 132 + context = 2 + + test_mlf_path = "../../../../Tests/EndToEndTests/Speech/Data/glob_00001.mlf" 
+ + features_file = "glob_0000.scp" + label_files = [ "glob_0000.mlf", test_mlf_path] + label_mapping_file = "state.list" + + fd = HTKFeatureDeserializer(StreamDefs( + amazing_features = StreamDef(shape=feature_dim, context=(context,context), scp=features_file))) + + ld = HTKMLFDeserializer(label_mapping_file, StreamDefs( + awesome_labels = StreamDef(shape=num_classes, mlf=label_files))) + + # Make sure we can read at least one minibatch. + mbsource = MinibatchSource([fd,ld]) + mbsource.next_minibatch(1) + + os.chdir(abs_path) diff --git a/Tests/EndToEndTests/Speech/Data/glob_00001.mlf b/Tests/EndToEndTests/Speech/Data/glob_00001.mlf new file mode 100644 index 000000000000..5f1c0b02a380 --- /dev/null +++ b/Tests/EndToEndTests/Speech/Data/glob_00001.mlf @@ -0,0 +1,5 @@ +#!MLF!# +"nonexistent.lab" +0 100000 sil[2] -0.785971 sil 454.794006 +100000 5500000 sil[3] 465.522034 +. diff --git a/Tests/UnitTests/MathTests/CPUMatrixTests.cpp b/Tests/UnitTests/MathTests/CPUMatrixTests.cpp index 5fbedf25b704..ed7ae41027e0 100755 --- a/Tests/UnitTests/MathTests/CPUMatrixTests.cpp +++ b/Tests/UnitTests/MathTests/CPUMatrixTests.cpp @@ -431,6 +431,26 @@ BOOST_FIXTURE_TEST_CASE(CPUMatrixElementOperations, RandomSeedFixture) m_NegSine.SetValue(m_Trig); m_NegSine.AssignNegativeSineOf(m_Trig); BOOST_CHECK(m_NegSine.IsEqualTo(m_NegSine_expected, c_epsilonFloatE4)); + + m3.SetValue(m0); + m3.InplaceCosh(); + m2(0, 0) = 1.54308063; + m2(0, 1) = 3.76219569; + m2(0, 2) = 10.067662; + m2(1, 0) = 27.30823284; + m2(1, 1) = 74.20994852; + m2(1, 2) = 201.71563612; + BOOST_CHECK(m3.IsEqualTo(m2, c_epsilonFloatE4)); + + m3.SetValue(m0); + m3.InplaceSinh(); + m2(0, 0) = 1.17520119; + m2(0, 1) = 3.62686041; + m2(0, 2) = 10.01787493; + m2(1, 0) = 27.2899172; + m2(1, 1) = 74.20321058; + m2(1, 2) = 201.71315737; + BOOST_CHECK(m3.IsEqualTo(m2, c_epsilonFloatE4)); } BOOST_FIXTURE_TEST_CASE(CPUMatrixNorms, RandomSeedFixture) diff --git a/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp b/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp index dd27feff3432..9acff559c442 100755 --- a/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp +++ b/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp @@ -338,7 +338,9 @@ void CheckEnumValuesNotModified() { static_cast(PrimitiveOpType::Assign) == 73 && static_cast(PrimitiveOpType::Gather) == 74 && static_cast(PrimitiveOpType::StableSigmoid) == 75 && - static_cast(PrimitiveOpType::RandomDistribution) == 76, + static_cast(PrimitiveOpType::RandomDistribution) == 76 && + static_cast(PrimitiveOpType::Sinh) == 77 && + static_cast(PrimitiveOpType::Cosh) == 78, "PrimitiveOpType enum value was modified."); } diff --git a/bindings/common/CNTKManagedCommon.i b/bindings/common/CNTKManagedCommon.i index 53a3132cd07a..0a0e3a9a3542 100644 --- a/bindings/common/CNTKManagedCommon.i +++ b/bindings/common/CNTKManagedCommon.i @@ -152,6 +152,8 @@ IGNORE_FUNCTION CNTK::Sigmoid; IGNORE_FUNCTION CNTK::Tanh; IGNORE_FUNCTION CNTK::Sin; IGNORE_FUNCTION CNTK::Cos; +IGNORE_FUNCTION CNTK::Cosh; +IGNORE_FUNCTION CNTK::Sinh; IGNORE_FUNCTION CNTK::ReLU; IGNORE_FUNCTION CNTK::Exp; IGNORE_FUNCTION CNTK::Log; diff --git a/bindings/python/cntk/contrib/deeprl/README.md b/bindings/python/cntk/contrib/deeprl/README.md new file mode 100644 index 000000000000..d1745ead8f36 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/README.md @@ -0,0 +1,26 @@ +CNTK DeepRL toolkit implements deep Q learning (and its variants) and actor-critic method. 
+Tabular Q learning and a random agent are also provided for baseline comparison.
+
+The observation space and action space are represented by an OpenAI gym space type, see
+https://github.com/openai/gym/tree/master/gym/spaces. Currently the toolkit requires the
+action space to be discrete https://github.com/openai/gym/blob/master/gym/spaces/discrete.py,
+i.e., an action is denoted by an integer between 0 and n-1 for n possible actions.
+The observation space can be arbitrary except Tuple https://github.com/openai/gym/blob/master/gym/spaces/tuple_space.py.
+
+An example script is provided at CNTK/Examples/ReinforcementLearning/deeprl/scripts/run.py,
+which interacts with the environment and does training and evaluation. Training details
+are specified via a configuration file. See CNTK/Examples/ReinforcementLearning/deeprl/scripts/config_examples
+for example configuration files for deep Q learning and the actor-critic method.
+
+To apply the deep RL algorithms to a new problem, describe the problem as an environment following
+the examples at CNTK/Examples/ReinforcementLearning/deeprl/env.
+
+References:
+deep Q learning
+- DQN https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
+- Prioritized Experience Replay https://arxiv.org/pdf/1511.05952.pdf
+- Dueling Network https://arxiv.org/pdf/1511.06581.pdf
+- Double Q Learning https://arxiv.org/pdf/1509.06461.pdf
+
+actor-critic
+- Actor-Critic https://arxiv.org/pdf/1602.01783.pdf
diff --git a/bindings/python/cntk/contrib/deeprl/__init__.py b/bindings/python/cntk/contrib/deeprl/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/bindings/python/cntk/contrib/deeprl/agent/__init__.py b/bindings/python/cntk/contrib/deeprl/agent/__init__.py
new file mode 100644
index 000000000000..8b137891791f
--- /dev/null
+++ b/bindings/python/cntk/contrib/deeprl/agent/__init__.py
@@ -0,0 +1 @@
+
diff --git a/bindings/python/cntk/contrib/deeprl/agent/agent.py b/bindings/python/cntk/contrib/deeprl/agent/agent.py
new file mode 100644
index 000000000000..7a758365f086
--- /dev/null
+++ b/bindings/python/cntk/contrib/deeprl/agent/agent.py
@@ -0,0 +1,231 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+"""Base class for defining an agent."""
+
+from abc import ABCMeta, abstractmethod
+
+import numpy as np
+
+from importlib import import_module
+
+from .shared.discretize import BoxSpaceDiscretizer
+
+
+class AgentBaseClass(object):
+    """Base class for defining an agent."""
+
+    __metaclass__ = ABCMeta
+
+    def __init__(self, o_space, a_space):
+        """
+        Constructor for AgentBaseClass.
+
+        Args:
+            o_space: observation space, gym.spaces.tuple_space.Tuple is not
+                supported.
+            a_space: action space, limited to gym.spaces.discrete.Discrete.
+        """
+        if self._classname(a_space) != 'gym.spaces.discrete.Discrete':
+            raise ValueError(
+                'Action space {0} incompatible with {1}. (Only supports '
+                'Discrete action spaces.)'.format(a_space, self))
+        self._num_actions = a_space.n
+
+        # We assume the observation is in one of the following cases:
+        # 1. discrete, and takes values from 0 to n - 1
+        # 2. can be discretized, and the raw state is converted to an internal
+        #    state taking values from 0 to n - 1
+        # 3. raw, such as images from Atari games
+        #
+        # OpenAI gym supports the following observation types:
+        # Discrete, Box, MultiBinary, MultiDiscrete and Tuple. 
Discrete + # corresponds to case 1. Box, MultiBinary and MultiDiscrete can be + # either case 2 or 3. Tuple is a mix of case 1, 2 or 3, and is not + # supported currently. + # + # The observation-related parameters are defined as follows: + # _discrete_observation_space: True for cases 1 and 2, False otherwise. + # State is represented by a scalar. + # _space_discretizer: Not none for case 2 to indicate a conversion on + # state is requried. None otherwise. + # _shape_of_inputs: (n, ) for cases 1 and 2 to indicate it is a vector + # of length n. For case 3, it is the shape of array that represents + # the state. For example, an image input will have shape denoted as + # tuple (channel, width, height). + if not (self._classname(o_space) == 'gym.spaces.discrete.Discrete' or + self._classname(o_space) == 'gym.spaces.multi_binary.MultiBinary' or + self._classname(o_space) == 'gym.spaces.box.Box' or + self._classname(o_space) == 'gym.spaces.multi_discrete.MultiDiscrete'): + raise ValueError( + 'Unsupported observation space type: {0}'.format(o_space)) + + self._space_discretizer = None + self._discrete_observation_space = \ + (self._classname(o_space) == 'gym.spaces.discrete.Discrete') + # Set self._num_states for discrete observation space only. + # Otherwise set it to None so that an exception will be raised + # should it be used later in the code. + self._num_states = \ + o_space.n if self._discrete_observation_space else None + + if (self._classname(o_space) == 'gym.spaces.discrete.Discrete' or + self._classname(o_space) == 'gym.spaces.multi_binary.MultiBinary'): + self._shape_of_inputs = (o_space.n,) + else: + self._shape_of_inputs = o_space.shape + + self._preprocessor = None + self._best_model = None + + @abstractmethod + def start(self, state): + """ + Start a new episode. + + Args: + state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + pass + + @abstractmethod + def step(self, reward, next_state): + """ + Observe one transition and choose an action. + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + pass + + @abstractmethod + def end(self, reward, next_state): + """ + Last observed reward/state of the episode (which then terminates). + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + """ + pass + + @abstractmethod + def save(self, filename): + """Save model to file.""" + pass + + @abstractmethod + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + pass + + @abstractmethod + def set_as_best_model(self): + """Copy current model to best model.""" + pass + + def enter_evaluation(self): + """Setup before evaluation.""" + pass + + def exit_evaluation(self): + """Tear-down after evaluation.""" + pass + + def evaluate(self, o): + """ + Choose action for given observation without updating agent's status. + + Args: + o (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + """ + a, _ = self._choose_action(self._preprocess_state(o)) + return a + + @abstractmethod + def _choose_action(self, state): + """ + Choose an action according to the policy. 
+ + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + + Returns: + action (int): action choosen by agent. + debug_info (str): auxiliary diagnostic information. + """ + pass + + def _discretize_observation_space(self, space, discretization_resolution): + if self._classname(space) == 'gym.spaces.box.Box': + self._space_discretizer = BoxSpaceDiscretizer( + space, + discretization_resolution) + self._discrete_observation_space = True + self._num_states = self._space_discretizer.num_states + self._shape_of_inputs = (self._num_states,) + else: + raise ValueError( + "Unsupported space type for discretization: {0}".format(space)) + + def _discretize_state_if_necessary(self, state): + if self._space_discretizer is not None: + return self._space_discretizer.discretize(state) + else: + return state + + def _index_to_vector(self, index, dimension): + # TODO: consider using cntk.core.Value.one_hot here. + a = np.zeros(dimension,) + a[index] = 1 + return a + + def _preprocess_state(self, state): + """Preprocess state to generate input to neural network. + + When state is a scalar which is the index of the state space, convert + it using one-hot encoding. + + For other cases, state and input are the same, roughly. + + CNTK only supports float32 and float64. Performs appropriate + type conversion as well. + """ + o = self._discretize_state_if_necessary(state) + if self._discrete_observation_space: + o = self._index_to_vector(o, self._num_states) + if self._preprocessor is not None: + o = self._preprocessor.preprocess(o) + # TODO: allow float64 dtype. + if o.dtype.name != 'float32': + o = o.astype(np.float32) + return o + + def _classname(self, instance): + return instance.__class__.__module__ + '.' + instance.__class__.__name__ + + def _import_method(self, path): + """Import method specified as module_name.method_name.""" + module_name, method_name = path.rsplit('.', 1) + try: + module = import_module(module_name) + method = getattr(module, method_name) + except (AttributeError, ImportError): + raise ValueError('Cannot import method: "{0}"'.format(path)) + return method diff --git a/bindings/python/cntk/contrib/deeprl/agent/agent_factory.py b/bindings/python/cntk/contrib/deeprl/agent/agent_factory.py new file mode 100644 index 000000000000..ae053a8f1d39 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/agent_factory.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Factory method to create an agent.""" + +import configparser + +from .policy_gradient import ActorCritic +from .qlearning import QLearning +from .random_agent import RandomAgent +from .tabular_qlearning import TabularQLearning + + +def make_agent(agent_config, o_space, a_space): + """ + Choose appropriate method to create an agent. + + Args: + agent_config: configure file specifying the agent type as well as + training details. + o_space: observation space, gym.spaces.tuple_space.Tuple is not + supported. + a_space: action space, limits to gym.spaces.discrete.Discrete. + + Returns: + subclass inherited from :class:`.agent.AgentBaseClass`: QLearning, + ActorCritic, TabularQLearning, or RandomAgent. 
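+
+    None is returned when the agent type specified in agent_config is not
+    one of the recognized values.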
+ """ + config = configparser.ConfigParser() + config.read(agent_config) + + agent_type = config.get( + 'General', 'Agent', fallback='random').lower() + agent = None + if agent_type == 'qlearning': + agent = QLearning(agent_config, o_space, a_space) + elif agent_type == 'actor_critic': + agent = ActorCritic(agent_config, o_space, a_space) + elif agent_type == 'tabular_qlearning': + agent = TabularQLearning(agent_config, o_space, a_space) + elif agent_type == 'random': + agent = RandomAgent(o_space, a_space) + return agent diff --git a/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py b/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py new file mode 100644 index 000000000000..d65d4533167d --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py @@ -0,0 +1,373 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Actor-Critic Policy Gradient.""" + +import cntk as C +import numpy as np + +import ast + +from .agent import AgentBaseClass +from .shared.cntk_utils import negative_of_entropy_with_softmax +from .shared.models import Models +from .shared.policy_gradient_parameters import PolicyGradientParameters + + +class ActorCritic(AgentBaseClass): + """ + Actor-Critic Policy Gradient. + + See https://arxiv.org/pdf/1602.01783.pdf for a description of algorithm. + """ + + def __init__(self, config_filename, o_space, a_space): + """ + Constructor for policy gradient. + + Args: + config_filename: configure file specifying training details. + o_space: observation space, gym.spaces.tuple_space.Tuple is not + supported. + a_space: action space, limits to gym.spaces.discrete.Discrete. + """ + super(ActorCritic, self).__init__(o_space, a_space) + + self._parameters = PolicyGradientParameters(config_filename) + + # Create preprocessor. + if self._parameters.preprocessing: + preproc = self._import_method(self._parameters.preprocessing) + self._preprocessor = preproc( + self._shape_of_inputs, + *ast.literal_eval(self._parameters.preprocessing_args)) + + self._set_up_policy_network_and_value_network() + + self._trajectory_states = [] + self._trajectory_actions = [] + self._trajectory_rewards = [] + + # Training data for the policy and value networks. Note they share the + # same input. + self._input_buffer = [] + self._value_network_output_buffer = [] + self._policy_network_output_buffer = [] + self._policy_network_weight_buffer = [] + + self.episode_count = 0 + self.step_count = 0 + + def start(self, state): + """ + Start a new episode. + + Args: + state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + # Call _process_accumulated_trajectory() to process unused trajectory + # data from previous episode. + self._process_accumulated_trajectory(False) + + # Reset preprocessor. + if self._preprocessor is not None: + self._preprocessor.reset() + + # Append new state and action + o = self._preprocess_state(state) + action, _ = self._choose_action(o) + self._trajectory_states.append(o) + self._trajectory_actions.append(action) + + self.episode_count += 1 + + return action, {} + + def step(self, reward, next_state): + """ + Observe one transition and choose an action. + + Args: + reward (float) : amount of reward returned after previous action. 
+ next_state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + o = self._preprocess_state(next_state) + self._trajectory_rewards.append(reward) + self._trajectory_states.append(o) + self.step_count += 1 + + # Update every self._parameters.update_frequency + if self.step_count % self._parameters.update_frequency == 0: + self._process_accumulated_trajectory(True) + self._update_networks() + + action, _ = self._choose_action(o) + self._trajectory_actions.append(action) + return action, {} + + def end(self, reward, next_state): + """ + Last observed reward/state of the episode (which then terminates). + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + """ + self._trajectory_rewards.append(reward) + self.step_count += 1 + + # Update every self._parameters.update_frequency + if self.step_count % self._parameters.update_frequency == 0: + self._process_accumulated_trajectory(False) + self._update_networks() + + def set_as_best_model(self): + """Copy current model to best model.""" + self._best_model = self._policy_network.clone('clone') + + def _set_up_policy_network_and_value_network(self): + shape_of_inputs = self._shape_of_inputs if self._preprocessor is None \ + else self._preprocessor.output_shape() + self._input_variables = \ + C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + + # Set up policy network. + if self._parameters.policy_representation == 'nn': + model = Models.feedforward_network( + shape_of_inputs, + self._num_actions, + self._parameters.policy_network_hidden_layers, + C.losses.cross_entropy_with_softmax, + use_placeholder_for_input=True) + else: + try: + model_definition_function = self._import_method( + self._parameters.policy_representation) + model = model_definition_function( + shape_of_inputs, + self._num_actions, + C.losses.cross_entropy_with_softmax, + use_placeholder_for_input=True) + except ValueError: + raise ValueError( + 'Unknown representation for policy: "{0}"' + '\n'.format(self._parameters.policy_representation)) + + self._policy_network = model['f'] + self._policy_network.replace_placeholder(self._input_variables) + self._policy_network_output_variables = model['outputs'] + # The weight is computed as part of the Actor-Critic algorithm. + self._policy_network_weight_variables = \ + C.ops.input_variable(shape=(1,), dtype=np.float32) + self._policy_network_loss = \ + model['loss'] * self._policy_network_weight_variables + + # Initialized from a saved model. + if self._parameters.initial_policy_network: + self._policy_network.restore( + self._parameters.initial_policy_network) + + print("Parameterized the agent's policy using neural networks " + '"{0}" with {1} actions.\n' + ''.format(self._parameters.policy_representation, + self._num_actions)) + + # Set up value network. + if self._parameters.shared_representation: + # For shared representation, policy pi and value function V share + # all non-output layers. To use cross_entropy_with_softmax loss + # from cntk, _policy_network defined here doesn't include softmax + # output layer. Therefore _value_network becomes _policy_network + # plus one additional linear output layer. 
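+            # Concretely, the Dense(1) head created below sits on top of the
+            # policy network's (pre-softmax) output, so the value loss
+            # backpropagates through all of the shared layers.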
+ self._value_network = C.layers.Dense(1, activation=None)( + self._policy_network) + self._value_network_output_variables = C.ops.input_variable( + shape=(1,), dtype=np.float32) + self._value_network_loss = C.losses.squared_error( + self._value_network, self._value_network_output_variables) + else: + if self._parameters.value_function_representation == 'nn': + model = Models.feedforward_network( + shape_of_inputs, + 1, # value network outputs a scalar + self._parameters.value_network_hidden_layers, + use_placeholder_for_input=True) + else: + try: + model_definition_function = self._import_method( + self._parameters.value_function_representation) + model = model_definition_function( + shape_of_inputs, + 1, # value network outputs a scalar + use_placeholder_for_input=True) + except ValueError: + raise ValueError( + 'Unknown representation for value function: "{0}"' + '\n'.format(self._parameters.value_function_representation)) + + self._value_network = model['f'] + self._value_network.replace_placeholder(self._input_variables) + self._value_network_output_variables = model['outputs'] + self._value_network_loss = model['loss'] # squared_error by default + + combined_networks = C.ops.combine( + [self._policy_network, self._value_network]) + combined_loss = self._policy_network_loss + \ + self._parameters.regularization_weight * \ + negative_of_entropy_with_softmax(self._policy_network) + \ + self._parameters.relative_step_size * self._value_network_loss + + # The learning rate will be updated later before each minibatch + # training. + # TODO: allow user to specify learner through config file. + self._trainer = C.train.trainer.Trainer( + combined_networks, + (combined_loss, None), + C.learners.adam( + combined_networks.parameters, + C.learners.learning_rate_schedule( + self._parameters.initial_eta, + C.learners.UnitType.sample), + momentum=C.learners.momentum_schedule(self._parameters.momentum), + variance_momentum=C.learners.momentum_schedule(0.999), + use_mean_gradient=True)) + + print("Parameterized the agent's value function using neural network " + '"{0}".\n'.format( + self._parameters.policy_representation + if self._parameters.shared_representation + else self._parameters.value_function_representation)) + + def _adjust_learning_rate(self): + if self._parameters.initial_eta != self._parameters.eta_minimum: + eta = self._parameters.eta_minimum + max( + 0, + (self._parameters.initial_eta - self._parameters.eta_minimum) * + (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) + self._trainer.parameter_learners[0].reset_learning_rate( + C.learners.learning_rate_schedule( + eta, C.learners.UnitType.sample)) + + def _choose_action(self, state): + """ + Choose an action according to policy. + + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + + Returns: + action (int): action choosen by agent. + debug_info (object): probability vector the action is sampled from. 
+ """ + action_probs = \ + C.ops.softmax(self._evaluate_model(self._policy_network, state)).eval() + return np.random.choice(self._num_actions, p=action_probs), action_probs + + def save(self, filename): + """Save model to file.""" + self._best_model.save(filename) + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + self._parameters.save(filename) + + def _evaluate_model(self, model, state): + r"""Evaluate log of pi(\cdot|state) or v(state).""" + return np.squeeze(model.eval({model.arguments[0]: [state]})) + + def _process_accumulated_trajectory(self, keep_last): + """Process accumulated trajectory to generate training data. + + Args: + keep_last (bool): last state without action and reward will be kept + if True. + """ + if not self._trajectory_states: + return + + # If trajectory hasn't terminated, we have _trajectory_states + # and sometimes _trajectory_actions having one more item than + # _trajectory_rewards. Same length is expected if called from + # start() or end(), where the trajectory has terminiated. + if len(self._trajectory_states) == len(self._trajectory_rewards): + bootstrap_r = 0 + else: + # Bootstrap from last state + bootstrap_r = np.asscalar(self._evaluate_model( + self._value_network, self._trajectory_states[-1])) + last_state = self._trajectory_states.pop() + if len(self._trajectory_actions) != len(self._trajectory_rewards): + # This will only happen when agent calls start() to begin + # a new episode without calling end() before to terminate the + # prevous episode. The last action thus can be discarded. + self._trajectory_actions.pop() + + if len(self._trajectory_states) != len(self._trajectory_rewards) or \ + len(self._trajectory_actions) != len(self._trajectory_rewards): + raise RuntimeError("Can't pair (state, action, reward). " + "state/action can only be one more step ahead " + "of rewrad in trajectory.") + + for transition in zip( + self._trajectory_states, + self._trajectory_actions, + self._discount_rewards(bootstrap_r)): + self._input_buffer.append(transition[0]) + self._value_network_output_buffer.append([transition[2]]) + # TODO: consider using cntk.ops.one_hot instead of _index_to_vector + self._policy_network_output_buffer.append( + self._index_to_vector(transition[1], self._num_actions)) + self._policy_network_weight_buffer.append([transition[2] + - self._evaluate_model(self._value_network, transition[0])]) + + # Clear the trajectory history. + self._trajectory_states = [] + self._trajectory_actions = [] + self._trajectory_rewards = [] + if keep_last: + self._trajectory_states.append(last_state) + + def _update_networks(self): + self._adjust_learning_rate() + + # Train the policy network on one minibatch. + self._trainer.train_minibatch( + { + self._input_variables: np.array(self._input_buffer).astype( + np.float32), + self._policy_network_output_variables: + np.array(self._policy_network_output_buffer).astype( + np.float32), + self._policy_network_weight_variables: + np.array(self._policy_network_weight_buffer).astype( + np.float32), + self._value_network_output_variables: + np.array(self._value_network_output_buffer).astype( + np.float32) + }) + + # Clear training data. 
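+        # Training here is on-policy: once the minibatch above has been
+        # consumed, the collected trajectory data no longer reflects the
+        # updated policy, so all four buffers are discarded.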
+ self._input_buffer = [] + self._value_network_output_buffer = [] + self._policy_network_output_buffer = [] + self._policy_network_weight_buffer = [] + + def _discount_rewards(self, bootstrap_r): + discounted_rewards = [0] * len(self._trajectory_rewards) + r = bootstrap_r + for t in reversed(range(len(self._trajectory_rewards))): + r = r * self._parameters.gamma + self._trajectory_rewards[t] + discounted_rewards[t] = r + return discounted_rewards diff --git a/bindings/python/cntk/contrib/deeprl/agent/qlearning.py b/bindings/python/cntk/contrib/deeprl/agent/qlearning.py new file mode 100644 index 000000000000..f90b3a8332dd --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/qlearning.py @@ -0,0 +1,381 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Deep Q-learning and its variants.""" + +import math + +import cntk as C +import numpy as np + +import ast + +from .agent import AgentBaseClass +from .shared.cntk_utils import huber_loss +from .shared.models import Models +from .shared.qlearning_parameters import QLearningParameters +from .shared.replay_memory import ReplayMemory + + +class QLearning(AgentBaseClass): + """ + Q-learning agent. + + Including: + - DQN https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf + - Prioritized Experience Replay https://arxiv.org/pdf/1511.05952.pdf + - Dueling Network https://arxiv.org/pdf/1511.06581.pdf + - Double Q Learning https://arxiv.org/pdf/1509.06461.pdf + """ + + def __init__(self, config_filename, o_space, a_space): + """Constructor for Q learning algorithm. + + Widely known as DQN. Use either predefined neural network structure + (see models.py) or customized network (see customized_models.py). + + Args: + config_filename: configure file specifying training details. + o_space: observation space, gym.spaces.tuple_space.Tuple is not + supported. + a_space: action space, limits to gym.spaces.discrete.Discrete. + """ + super(QLearning, self).__init__(o_space, a_space) + + self._parameters = QLearningParameters(config_filename) + + # Create preprocessor. + if self._parameters.preprocessing: + try: + preproc = self._import_method(self._parameters.preprocessing) + self._preprocessor = preproc( + self._shape_of_inputs, + *ast.literal_eval(self._parameters.preprocessing_args)) + except ValueError: + raise ValueError( + 'Unknown preprocessing method: "{0}"' + '\n'.format(self._parameters.preprocessing)) + + # Set up the Q-function. 
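+        # QRepresentation in the config selects between the predefined
+        # feedforward network ('dqn'), the dueling architecture
+        # ('dueling-dqn'), or a customized model referenced by its
+        # 'module_name.function_name' path (see customized_models.py).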
+ shape_of_inputs = self._shape_of_inputs \ + if self._preprocessor is None \ + else self._preprocessor.output_shape() + if self._parameters.q_representation == 'dqn': + model = Models.feedforward_network( + shape_of_inputs, + self._num_actions, + self._parameters.hidden_layers, + huber_loss if self._parameters.use_error_clipping else None) + elif self._parameters.q_representation == 'dueling-dqn': + model = Models.dueling_network( + shape_of_inputs, + self._num_actions, + self._parameters.hidden_layers, + huber_loss if self._parameters.use_error_clipping else None) + else: + try: + model_definition_function = self._import_method( + self._parameters.q_representation) + model = model_definition_function( + shape_of_inputs, + self._num_actions, + huber_loss if self._parameters.use_error_clipping else None) + except ValueError: + raise ValueError( + 'Unknown representation for Q-learning: "{0}"' + '\n'.format(self._parameters.q_representation)) + + self._q = model['f'] + self._input_variables = model['inputs'] + self._output_variables = model['outputs'] + if self._parameters.use_prioritized_replay: + self._weight_variables = \ + C.ops.input_variable(shape=(1,), dtype=np.float32) + self._loss = model['loss'] * self._weight_variables + else: + self._loss = model['loss'] + + # If gradient_clipping_threshold_per_sample is inf, gradient clipping + # will not be performed. Set gradient_clipping_with_truncation to False + # to clip the norm. + # TODO: allow user to specify learner through config file. + opt = C.learners.adam( + self._q.parameters, + C.learners.learning_rate_schedule( + self._parameters.initial_eta, C.learners.UnitType.sample), + use_mean_gradient=True, + momentum=C.learners.momentum_schedule(self._parameters.momentum), + variance_momentum=C.learners.momentum_schedule(0.999), + gradient_clipping_threshold_per_sample= + self._parameters.gradient_clipping_threshold, + gradient_clipping_with_truncation=False) + self._trainer = C.train.trainer.Trainer( + self._q, (self._loss, None), opt) + + # Initialize target Q. + self._target_q = self._q.clone('clone') + + # Initialize replay memory. + self._replay_memory = ReplayMemory( + self._parameters.replay_memory_capacity, + self._parameters.use_prioritized_replay) + + print('Parameterized Q-learning agent using neural networks ' + '"{0}" with {1} actions.\n' + ''.format(self._parameters.q_representation, + self._num_actions)) + + self.episode_count = 0 + self.step_count = 0 + + def start(self, state): + """ + Start a new episode. + + Args: + state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + if self._preprocessor is not None: + self._preprocessor.reset() + + self._adjust_exploration_rate() + self._last_state = self._preprocess_state(state) + self._last_action, action_behavior = \ + self._choose_action(self._last_state) + self.episode_count += 1 + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def step(self, reward, next_state): + """ + Observe one transition and choose an action. + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. 
+ """ + next_encoded_state = self._preprocess_state(next_state) + priority = self._compute_priority( + self._last_state, self._last_action, reward, next_encoded_state) + self._replay_memory.store( + self._last_state, + self._last_action, + reward, + next_encoded_state, + priority) + self.step_count += 1 + + # Update Q every self._parameters.q_update_frequency + self._update_q_periodically() + + self._adjust_exploration_rate() + self._last_state = next_encoded_state + self._last_action, action_behavior = self._choose_action( + self._last_state) + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def end(self, reward, next_state): + """ + Last observed reward/state of the episode (which then terminates). + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + """ + priority = self._compute_priority( + self._last_state, self._last_action, reward, None) + self._replay_memory.store( + self._last_state, + self._last_action, + reward, + None, + priority) + self.step_count += 1 + + # Update Q every self._parameters.q_update_frequency + self._update_q_periodically() + + def set_as_best_model(self): + """Copy current model to best model.""" + self._best_model = self._q.clone('clone') + + def enter_evaluation(self): + """Setup before evaluation.""" + self._epsilon = 0 + + def _adjust_learning_rate(self): + if self._parameters.initial_eta != self._parameters.eta_minimum: + eta = self._parameters.eta_minimum + max( + 0, + (self._parameters.initial_eta - self._parameters.eta_minimum) * + (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) + + self._trainer.parameter_learners[0].reset_learning_rate( + C.learners.learning_rate_schedule( + eta, C.learners.UnitType.sample)) + + def _adjust_exploration_rate(self): + self._epsilon = self._parameters.epsilon_minimum + max( + 0, + (self._parameters.initial_epsilon - self._parameters.epsilon_minimum) * + (1 - float(self.step_count)/self._parameters.epsilon_decay_step_count)) + + def _choose_action(self, state): + """ + Epsilon greedy policy. + + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + + Returns: + action (int): action choosen by agent. + debug_info (str): auxiliary diagnostic information. + """ + if self.step_count < self._parameters.replay_start_size or \ + np.random.uniform(0, 1) < self._epsilon: + return np.random.randint(self._num_actions), 'RANDOM' + else: + return np.argmax(self._evaluate_q(self._q, state)), 'GREEDY' + + def save(self, filename): + """Save model to file.""" + self._best_model.save(filename) + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + self._parameters.save(filename) + + def _evaluate_q(self, model, state, action=None): + """ + Evaluate Q[state, action]. + + If action is None, return values for all actions. + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + action (int): action choosen by agent. 
+ """ + q = np.squeeze(model.eval({model.arguments[0]: [state]})) + if action is None: + return q + else: + return q[action] + + def _update_q_periodically(self): + if self.step_count < self._parameters.replay_start_size or \ + self.step_count % self._parameters.q_update_frequency != 0: + return + + self._adjust_learning_rate() + for i in range(self._parameters.replays_per_update): + self._replay_and_update() + + # Clone target network periodically. + if self.step_count % \ + self._parameters.target_q_update_frequency == 0: + self._target_q = self._q.clone('clone') + + def _replay_and_update(self): + """Perform one minibatch update of Q.""" + input_values = [] + output_values = [] + if self._parameters.use_prioritized_replay: + # importance sampling weights. + weight_values = [] + + minibatch = self._replay_memory.sample_minibatch( + self._parameters.minibatch_size) + for index_transition_pair in minibatch: + input_value = index_transition_pair[1].state + + # output_value is the same for all actions except last_action. + output_value = self._evaluate_q( + self._q, index_transition_pair[1].state) + td_err = self._compute_td_err( + index_transition_pair[1].state, + index_transition_pair[1].action, + index_transition_pair[1].reward, + index_transition_pair[1].next_state) + output_value[index_transition_pair[1].action] += td_err + + input_values.append(input_value) + output_values.append(output_value) + + if self._parameters.use_prioritized_replay: + weight_values.append(math.pow( + index_transition_pair[1].priority, + -self._parameters.priority_beta)) + + if self._parameters.use_prioritized_replay: + w_sum = sum(weight_values) + weight_values = [[w / w_sum] for w in weight_values] + self._trainer.train_minibatch( + { + self._input_variables: np.array(input_values).astype( + np.float32), + self._output_variables: np.array(output_values).astype( + np.float32), + self._weight_variables: np.array(weight_values).astype( + np.float32) + }) + + # Update replay priority. 
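+            # Priorities are recomputed from the latest TD errors as
+            # (|td_err| + priority_epsilon) ** priority_alpha (see
+            # _compute_priority) and written back into the replay memory's
+            # sum-tree.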
+ position_priority_map = {} + for index_transition_pair in minibatch: + position_priority_map[index_transition_pair[0]] = \ + self._compute_priority( + index_transition_pair[1].state, + index_transition_pair[1].action, + index_transition_pair[1].reward, + index_transition_pair[1].next_state) + + self._replay_memory.update_priority(position_priority_map) + else: + self._trainer.train_minibatch( + { + self._input_variables: np.array(input_values).astype( + np.float32), + self._output_variables: np.array(output_values).astype( + np.float32) + }) + + def _compute_td_err(self, state, action, reward, next_state): + td_err = reward + if next_state is not None: + if self._parameters.double_q_learning: + td_err += self._parameters.gamma * \ + self._evaluate_q( + self._target_q, + next_state, + np.argmax(self._evaluate_q(self._q, next_state))) + else: + td_err += self._parameters.gamma * np.max( + self._evaluate_q(self._target_q, next_state)) + td_err -= self._evaluate_q(self._q, state, action) + return td_err + + def _compute_priority(self, state, action, reward, next_state): + priority = None + if self._parameters.use_prioritized_replay: + priority = math.pow( + math.fabs(self._compute_td_err( + state, action, reward, next_state)) + + self._parameters.priority_epsilon, + self._parameters.priority_alpha) + return priority diff --git a/bindings/python/cntk/contrib/deeprl/agent/random_agent.py b/bindings/python/cntk/contrib/deeprl/agent/random_agent.py new file mode 100644 index 000000000000..a82a3be43391 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/random_agent.py @@ -0,0 +1,57 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Baseline agent that selects action uniformly randomly.""" + +import numpy as np + +from .agent import AgentBaseClass + + +class RandomAgent(AgentBaseClass): + """Agent that selects action uniformly randomly.""" + + def __init__(self, o_space, a_space): + """Constructor for RandomAgent.""" + super(RandomAgent, self).__init__(o_space, a_space) + + print('Initialized random agent with {0} actions.'.format( + self._num_actions)) + + self.episode_count = 0 + # step_count is incremented each time after receiving reward. 
+ self.step_count = 0 + + def start(self, state): + """Start a new episode.""" + self.episode_count += 1 + action, _ = self._choose_action(state) + return action, {} + + def step(self, reward, next_state): + """Observe one transition and choose an action.""" + self.step_count += 1 + action, _ = self._choose_action(next_state) + return action, {} + + def end(self, reward, next_state): + """Last observed reward/state of the episode (which then terminates).""" + self.step_count += 1 + + def set_as_best_model(self): + """Copy current model to best model.""" + pass + + def save(self, filename): + """Save best model to file.""" + pass + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + pass + + def _choose_action(self, state): + """Random policy.""" + return np.random.randint(self._num_actions), 'RANDOM' diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/__init__.py b/bindings/python/cntk/contrib/deeprl/agent/shared/__init__.py new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/__init__.py @@ -0,0 +1 @@ + diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/cntk_utils.py b/bindings/python/cntk/contrib/deeprl/agent/shared/cntk_utils.py new file mode 100644 index 000000000000..478b9ed6d0e4 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/cntk_utils.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Utility functions.""" + +import cntk.ops as C + + +def huber_loss(output, target): + r"""See https://en.wikipedia.org/wiki/Huber_loss for definition. + + \delta is set to 1. This is not the right definition if output and target + differ in more than one dimension. + """ + a = target - output + return C.reduce_sum(C.element_select( + C.less(C.abs(a), 1), C.square(a) * 0.5, C.abs(a) - 0.5)) + + +def negative_of_entropy_with_softmax(p): + """See https://en.wikipedia.org/wiki/Entropy_(information_theory).""" + return C.reduce_sum(C.softmax(p) * p) - C.reduce_log_sum_exp(p) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/customized_models.py b/bindings/python/cntk/contrib/deeprl/agent/shared/customized_models.py new file mode 100644 index 000000000000..8e3985d9b4c8 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/customized_models.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Customized Q function or (unnormalized) log of policy function. + +If models from cntk.contrib.deeprl.agent.shared.models are not adequate, write +your own model as a function, which takes two required arguments +'shape_of_inputs', 'number_of_outputs', and two optional arguments +'loss_function', 'use_placeholder_for_input', and outputs a dictionary +containing 'inputs', 'outputs', 'f' and 'loss'. In the config file, set +QRepresentation or PolicyRepresentation to path (module_name.function_name) of +the function. QLearning/PolicyGradient will then automatically search for it. 
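+
+For example, to point QLearning at the conv_dqn model defined below, an
+illustrative config entry would be:
+
+    [QLearningAlgo]
+    QRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn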
+""" + +import cntk as C +import numpy as np + + +def conv_dqn(shape_of_inputs, + number_of_outputs, + loss_function=None, + use_placeholder_for_input=False): + """Example convolutional neural network for approximating the Q value function. + + This is the model used in the original DQN paper + https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf. + + Args: + shape_of_inputs: tuple of array (input) dimensions. + number_of_outputs: dimension of output, equals the number of + possible actions. + loss_function: if not specified, use squared loss by default. + use_placeholder_for_input: if true, inputs have to be replaced + later with actual input_variable. + + Returns: a Python dictionary with string-valued keys including + 'inputs', 'outputs', 'loss' and 'f'. + """ + # input/output + inputs = C.ops.placeholder(shape=shape_of_inputs) \ + if use_placeholder_for_input \ + else C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + outputs = C.ops.input_variable( + shape=(number_of_outputs,), dtype=np.float32) + + # network structure + centered_inputs = inputs - 128 + scaled_inputs = centered_inputs / 256 + + with C.layers.default_options(activation=C.ops.relu): + q = C.layers.Sequential([ + C.layers.Convolution((8, 8), 32, strides=4), + C.layers.Convolution((4, 4), 64, strides=2), + C.layers.Convolution((3, 3), 64, strides=2), + C.layers.Dense((512,)), + C.layers.Dense(number_of_outputs, activation=None) + ])(scaled_inputs) + + if loss_function is None: + loss = C.losses.squared_error(q, outputs) + else: + loss = loss_function(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/discretize.py b/bindings/python/cntk/contrib/deeprl/agent/shared/discretize.py new file mode 100644 index 000000000000..3bfe575290fe --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/discretize.py @@ -0,0 +1,52 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Discretize continuous environment space.""" + +import numpy as np + + +class BoxSpaceDiscretizer: + """Discretize Box space.""" + + def __init__(self, space, resolution): + spaceclassname = \ + space.__class__.__module__ + '.' + space.__class__.__name__ + if spaceclassname != 'gym.spaces.box.Box': + raise ValueError( + 'Space {0} incompatible with {1}. 
(Only supports ' + 'Box space)'.format(space, self)) + + assert np.isscalar(resolution) or space.low.shape == resolution.shape + + self._state_mins = space.low + self._state_maxs = space.high + if np.isscalar(resolution): + self._state_resolutions = resolution + np.zeros(space.low.shape) + else: + self._state_resolutions = resolution + self.num_states = int(np.prod(self._state_resolutions)) + + def discretize(self, value): + """Discretize box space observation.""" + index = 0 + for i, v in np.ndenumerate(value): + i_idx = self._get_index( + v, + self._state_mins[i], + self._state_maxs[i], + self._state_resolutions[i]) + index = index * self._state_resolutions[i] + i_idx + return int(index) + + def _get_index(self, value, minv, maxv, res): + """Convert a continuous value to a discrete number.""" + if value >= maxv: + return res - 1 + elif value <= minv: + return 0 + else: + ind = np.floor((value - minv) * res / (maxv - minv)) + return int(min(res - 1, max(0, ind))) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/models.py b/bindings/python/cntk/contrib/deeprl/agent/shared/models.py new file mode 100644 index 000000000000..52fc0a22c32b --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/models.py @@ -0,0 +1,156 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""A set of predefined models used by Q learning or Actor-Critic.""" + +import cntk as C +import numpy as np + +import ast + + +class Models: + """A set of predefined models to approximate Q or log of pi (policy). + + The loss function needs to be 'cross_entropy_with_softmax' for policy + gradient methods. + """ + + @staticmethod + def feedforward_network(shape_of_inputs, + number_of_outputs, + model_hidden_layers, + loss_function=None, + use_placeholder_for_input=False): + """Feedforward network to approximate Q or log of pi. + + Args: + shape_of_inputs: tuple of array (input) dimensions. + number_of_outputs: dimension of output, equals the number of + possible actions. + model_hidden_layers: string representing a list of integers + corresponding to number of nodes in each hidden layer. + loss_function: if not specified, use squared loss by default. + use_placeholder_for_input: if true, inputs have to be replaced + later with actual input_variable. + + Returns: a Python dictionary with string valued keys including + 'inputs', 'outputs', 'loss' and 'f'. + """ + # input/output + inputs = C.ops.placeholder(shape=shape_of_inputs) \ + if use_placeholder_for_input \ + else C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + outputs = C.ops.input_variable(shape=(number_of_outputs,), dtype=np.float32) + + # network structure + hidden_layers = ast.literal_eval(model_hidden_layers) + f = C.layers.Sequential([ + C.layers.For(range(len(hidden_layers)), + lambda h: C.layers.Dense(hidden_layers[h], activation=C.ops.relu)), + C.layers.Dense(number_of_outputs, activation=None) + ])(inputs) + + if loss_function is None: + loss = C.losses.squared_error(f, outputs) + else: + loss = loss_function(f, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': f, + 'loss': loss + } + + @staticmethod + def dueling_network(shape_of_inputs, + number_of_outputs, + model_hidden_layers, + loss_function=None, + use_placeholder_for_input=False): + """Dueling network to approximate Q function. 
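+
+        Computes Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), matching the
+        q = v + a - avg_a expression in the network definition below.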
+ + See paper at https://arxiv.org/pdf/1511.06581.pdf. + + Args: + shape_of_inputs: tuple of array (input) dimensions. + number_of_outputs: dimension of output, equals the number of + possible actions. + model_hidden_layers: in the form of "[comma-separated integers, + [comma-separated integers], [comma-separated integers]]". Each + integer is the number of nodes in a hidden layer.The + first set of integers represent the shared component in dueling + network. The second set correponds to the state value function + V and the third set correponds to the advantage function A. + loss_function: if not specified, use squared loss by default. + use_placeholder_for_input: if true, inputs have to be replaced + later with actual input_variable. + + Returns: a Python dictionary with string-valued keys including + 'inputs', 'outputs', 'loss' and 'f'. + """ + # input/output + inputs = C.ops.placeholder(shape=shape_of_inputs) \ + if use_placeholder_for_input \ + else C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + outputs = C.ops.input_variable( + shape=(number_of_outputs,), dtype=np.float32) + + # network structure + shared_hidden_layers, v_hidden_layers, a_hidden_layers =\ + Models._parse_dueling_network_structure(model_hidden_layers) + # shared layers + s = C.layers.For( + range(len(shared_hidden_layers)), + lambda h: C.layers.Dense(shared_hidden_layers[h], activation=C.ops.relu))(inputs) + # Value function + v = C.layers.Sequential([ + C.layers.For( + range(len(v_hidden_layers)), + lambda h: C.layers.Dense(v_hidden_layers[h], activation=C.ops.relu)), + C.layers.Dense(1, activation=None) + ])(s) + # Advantage function + a = C.layers.Sequential([ + C.layers.For( + range(len(a_hidden_layers)), + lambda h: C.layers.Dense(a_hidden_layers[h], activation=C.ops.relu)), + C.layers.Dense(number_of_outputs, activation=None) + ])(s) + # Q = V + A - avg(A) + avg_a = C.layers.AveragePooling((number_of_outputs,))(a) + q = v + a - avg_a + + if loss_function is None: + loss = C.losses.squared_error(q, outputs) + else: + loss = loss_function(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } + + @staticmethod + def _parse_dueling_network_structure(hidden_layers_str): + hidden_layers = ast.literal_eval(hidden_layers_str) + + if not ( + len(hidden_layers) > 2 + and isinstance(hidden_layers[-1], list) + and isinstance(hidden_layers[-2], list)): + raise ValueError('Invalid dueling network structure.') + + return\ + Models._remove_none_elements_from_list(hidden_layers[:-2]),\ + Models._remove_none_elements_from_list(hidden_layers[-2]),\ + Models._remove_none_elements_from_list(hidden_layers[-1]) + + @staticmethod + def _remove_none_elements_from_list(value_list): + return [e for e in value_list if e is not None] diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/policy_gradient_parameters.py b/bindings/python/cntk/contrib/deeprl/agent/shared/policy_gradient_parameters.py new file mode 100644 index 000000000000..4338e74e242c --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/policy_gradient_parameters.py @@ -0,0 +1,113 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== +"""Policy Gradient parameters.""" + +import configparser + + +class PolicyGradientParameters: + """Parameters used by Policy Gradient algorithms.""" + + def __init__(self, config_file): + """Read parameter values from config_file. + + Use default value if the parameter is not present. + """ + self.config = configparser.ConfigParser() + self.config.optionxform = str + self.config.read(config_file) + + # Discount factor. + self.gamma = self.config.getfloat( + 'General', 'Gamma', fallback=0.95) + + # Name of class that does preprocessing. + self.preprocessing = self.config.get( + 'General', 'PreProcessing', fallback='') + + # Arguments (except the first argument input_shape) of preprocessing as + # a tuple. + self.preprocessing_args = self.config.get( + 'General', 'PreProcessingArgs', fallback='()') + + # If true, policy pi and value function V share all non-output layers. + # PolicyRepresentation (and/or PolicyNetworkHiddenLayers) define + # structure for all non-output layers. Policy then has one softmax + # output layer, and value function has one linear output layer. If + # false, all non-output layers of policy are still specified by + # PolicyRepresentation. This is equivalent to defining unnormalized log + # of policy pi. The value function, however, is completely specified by + # ValueFunctionRepresentation (and/or ValueNetworkHiddenLayers), which + # outputs a scalar. + self.shared_representation = self.config.getboolean( + 'PolicyGradient', 'SharedRepresentation', fallback=False) + + # Representation of policy. + self.policy_representation = self.config.get( + 'PolicyGradient', 'PolicyRepresentation', fallback='nn') + + # Suppose gradient of policy network is g, gradient of value network + # is gv, during each update, policy network is updated as + # \theta <- \theta + \eta * g where \eta is learning rate, and + # value network is updated as + # \theta_v <- \theta_v + \eta * relative_step_size * gv. This allows + # policy network and value network to be updated at different learning + # rates. Alternatively, this can be viewed as relative weight between + # policy loss and value function loss. + self.relative_step_size = self.config.getfloat( + 'PolicyGradient', 'RelativeStepSize', fallback=0.5) + + # Weight of regularization term. + self.regularization_weight = self.config.getfloat( + 'PolicyGradient', 'RegularizationWeight', fallback=0.001) + + # Number of nodes in each hidden layer of policy network. + self.policy_network_hidden_layers = self.config.get( + 'NetworkModel', 'PolicyNetworkHiddenLayerNodes', fallback='[10]') + + # Representation of value function. + self.value_function_representation = self.config.get( + 'PolicyGradient', 'ValueFunctionRepresentation', fallback='nn') + + # Number of nodes in each hidden layer of value network. + self.value_network_hidden_layers = self.config.get( + 'NetworkModel', 'ValueNetworkHiddenLayerNodes', fallback='[10]') + + # Initial value of eta, which is the learning rate for gradient descent. + self.initial_eta = self.config.getfloat( + 'Optimization', 'InitialEta', fallback=0.001) + + # Number of steps before eta reaches minimum value. + self.eta_decay_step_count = self.config.getint( + 'Optimization', 'EtaDecayStepCount', fallback=100000) + + # Minimum value of eta. 
Since Adam is used as the optimizer, a good + # starting point is to set EtaMinimum equal to InitialEta, which is + # equivalent to using a constant global learning rate cap, while Adam + # continuously adapts individual parameter learning rates. + self.eta_minimum = self.config.getfloat( + 'Optimization', 'EtaMinimum', fallback=0.001) + + # Momentum used by Adam. + self.momentum = self.config.getfloat( + 'Optimization', 'Momentum', fallback=0.95) + + # Update frequency for policy network and value network, in the number + # of time steps. + self.update_frequency = self.config.getint( + 'PolicyGradient', 'UpdateFrequency', fallback=64) + + # Name of a file containing model of the same structure as policy + # network (unnormalized log of policy pi), where model is obtained + # through other methods (e.g. supervised learning), and saved by + # cntk.ops.functions.Function.save(). Random initialization is + # performed if value is empty. + self.initial_policy_network = self.config.get( + 'PolicyGradient', 'InitialPolicy', fallback='') + + def save(self, config_file): + with open(config_file, 'w') as c: + self.config.write(c) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/preprocessing.py b/bindings/python/cntk/contrib/deeprl/agent/shared/preprocessing.py new file mode 100644 index 000000000000..6db68521e81c --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/preprocessing.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Base class for defining preprocessing, as well as two concrete examples.""" + +from abc import ABCMeta, abstractmethod +from collections import deque + +import numpy as np +from PIL import Image + + +class Preprocessing(object): + """Base class for defining preprocessing. + + All subclass constructors will take input_shape as the first argument. + """ + + __metaclass__ = ABCMeta + + def __init__(self, input_shape): + """Constructor for base Preprocessing class.""" + self._input_shape = input_shape + + @abstractmethod + def output_shape(self): + """Return shape of preprocessed observation.""" + pass + + @abstractmethod + def reset(self): + """Reset preprocessing pipeline for new episode.""" + pass + + @abstractmethod + def preprocess(self, observation): + """Return preprocessed observation.""" + pass + + +class AtariPreprocessing(Preprocessing): + """Preprocess screen images from Atari 2600 games. + + The image is represented by an array of shape (210, 160, 3). See + https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf + for more details. 
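+
+    Each frame is max-pooled with the previous raw frame, reduced to its
+    luminance channel, resized to 84 x 84 and stacked with the previous
+    history_len - 1 processed frames.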
+ """ + + def __init__(self, input_shape, history_len=4): + super(AtariPreprocessing, self).__init__(input_shape) + self.__history_len = history_len + self.__processed_image_seq = deque(maxlen=history_len) + self.reset() + + def output_shape(self): + """Return shape of preprocessed Atari images.""" + return (self.__history_len, 84, 84) + + def reset(self): + """Reset preprocessing pipeline for new episode.""" + self.__previous_raw_image = np.zeros(self._input_shape, dtype=np.uint8) + self.__processed_image_seq.clear() + for i in range(self.__history_len): + self.__processed_image_seq.append(np.zeros((84, 84))) + + def preprocess(self, image): + """Return preprocessed screen images from Atari 2600 games.""" + if image.shape != self._input_shape: + raise ValueError( + 'Expecting image in shape {0} but get {1}\n'.format( + self._input_shape, image.shape)) + + # Take the maximum value for each pixel over the current frame and the + # previous one. + im = Image.fromarray( + np.maximum(image, self.__previous_raw_image), mode='RGB') + + # Extract luminance band. + im = im.convert('YCbCr').split()[0] + + # Scale to 84 x 84 + im = im.resize((84, 84), Image.BILINEAR) + + self.__processed_image_seq.append(np.array(im)) + self.__previous_raw_image = image + + return np.stack(list(self.__processed_image_seq)) + + +class SlidingWindow(Preprocessing): + """Stack windowed inputs (x(t-m+1), ... x(t)).""" + + def __init__(self, input_shape, history_len=4, dtype=np.float32): + super(SlidingWindow, self).__init__(input_shape) + self.__dtype = dtype + self.__history_len = history_len + self.__history = deque(maxlen=history_len) + self.reset() + + def output_shape(self): + """Return shape of preprocessed input.""" + return (self.__history_len,) + self._input_shape + + def reset(self): + """Reset preprocessing pipeline for new episode.""" + self.__history.clear() + for i in range(self.__history_len): + self.__history.append(np.zeros(self._input_shape, self.__dtype)) + + def preprocess(self, x): + """Return preprocessed input x.""" + if x.shape != self._input_shape: + raise ValueError( + 'Expecting input in shape {0} but get {1}\n'.format( + self._input_shape, x.shape)) + + if x.dtype != self.__dtype: + raise ValueError( + 'Expecting input in dtype {0} but get {1}\n'.format( + self.__dtype, x.dtype)) + + self.__history.append(x) + return np.stack(list(self.__history)) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/qlearning_parameters.py b/bindings/python/cntk/contrib/deeprl/agent/shared/qlearning_parameters.py new file mode 100644 index 000000000000..bf4bcb0efb9d --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/qlearning_parameters.py @@ -0,0 +1,155 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Q learning parameters.""" + +import numpy as np + +import ast +import configparser + + +class QLearningParameters: + """Parameters used by Q learning algorithm.""" + + def __init__(self, config_file): + """Read parameter values from config_file. + + Use default value if the value is not present. + """ + # TODO: validate parameter values. + self.config = configparser.ConfigParser() + self.config.optionxform = str + self.config.read(config_file) + + # Discount factor + self.gamma = self.config.getfloat( + 'General', 'Gamma', fallback=0.95) + + # Name of class that does preprocessing. 
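+        # As a hypothetical example, the Atari pipeline shipped with this
+        # toolkit would be referenced as
+        # 'cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing'.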
+ self.preprocessing = self.config.get( + 'General', 'PreProcessing', fallback='') + + # Arguments (except the first argument input_shape) of preprocessing as + # a tuple. + self.preprocessing_args = self.config.get( + 'General', 'PreProcessingArgs', fallback='()') + + # Representation of Q function, taking value from {'tabular', 'nn'}. + self.q_representation = self.config.get( + 'QLearningAlgo', 'QRepresentation', fallback='tabular') + + # Initial value of epsilon (exploration rate), used by epsilon-greedy + # policy. + self.initial_epsilon = self.config.getfloat( + 'QLearningAlgo', 'InitialEpsilon', fallback=0.1) + + # Number of steps before epsilon reaches minimum value. + self.epsilon_decay_step_count = self.config.getint( + 'QLearningAlgo', 'EpsilonDecayStepCount', fallback=100000) + + # Minimum value of epsilon. + self.epsilon_minimum = self.config.getfloat( + 'QLearningAlgo', 'EpsilonMinimum', fallback=0.01) + + # Initial value of eta, which is the learning rate for gradient + # descent. + self.initial_eta = self.config.getfloat( + 'Optimization', 'InitialEta', fallback=0.001) + + # Number of steps before eta reaches minimum value. + self.eta_decay_step_count = self.config.getint( + 'Optimization', 'EtaDecayStepCount', fallback=100000) + + # Minimum value of eta. Since Adam is used as the optimizer, a good + # starting point is to set EtaMinimum equal to InitialEta, which is + # equivalent to using a constant learning rate. + self.eta_minimum = self.config.getfloat( + 'Optimization', 'EtaMinimum', fallback=0.001) + + # Momentum used by RMSProp. + self.momentum = self.config.getfloat( + 'Optimization', 'Momentum', fallback=0.95) + + # Initial value for table entries. + # TODO(maoyi): allow DQN initialization through config file. + self.initial_q = self.config.getfloat( + 'QLearningAlgo', 'InitialQ', fallback=0.0) + + # Number of partitions for discretizing the continuous space. Either a + # scalar which is applied to all dimensions, or a list specifying + # different value for different dimension. + self.discretization_resolution = ast.literal_eval(self.config.get( + 'QLearningAlgo', 'DiscretizationResolution', fallback='10')) + if isinstance(self.discretization_resolution, list): + self.discretization_resolution = np.array( + self.discretization_resolution) + + # Number of actions chosen between successive + # target network updates. + self.target_q_update_frequency = self.config.getint( + 'QLearningAlgo', 'TargetQUpdateFrequency', fallback=10000) + + # Sample size of each minibatch. + self.minibatch_size = self.config.getint( + 'QLearningAlgo', 'MinibatchSize', fallback=32) + + # Number of replays per update. + self.replays_per_update = self.config.getint( + 'QLearningAlgo', 'ReplaysPerUpdate', fallback=1) + + # Number of actions chosen between successive SGD updates of Q. + self.q_update_frequency = self.config.getint( + 'QLearningAlgo', 'QUpdateFrequency', fallback=4) + + # Use Huber loss with \delta=1 when True. Otherwise, use least square + # loss. + self.use_error_clipping = self.config.getboolean( + 'QLearningAlgo', 'ErrorClipping', fallback=True) + + # Capacity of replay memory. + self.replay_memory_capacity = self.config.getint( + 'ExperienceReplay', 'Capacity', fallback=100000) + + # A uniform random policy is run for this number of steps to populate + # replay memory. + self.replay_start_size = self.config.getint( + 'ExperienceReplay', 'StartSize', fallback=5000) + + # Use prioritized replay. Fall back to uniform sampling when False . 
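+        # The PriorityAlpha/PriorityBeta/PriorityEpsilon settings below
+        # follow the notation of https://arxiv.org/pdf/1511.05952.pdf.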
+ self.use_prioritized_replay = self.config.getboolean( + 'ExperienceReplay', 'Prioritized', fallback=False) + + # Used by prioritized replay, to determine how much prioritization is + # used, with 0 corresponding to uniform. + self.priority_alpha = self.config.getfloat( + 'ExperienceReplay', 'PriorityAlpha', fallback=0.7) + + # Used by prioritized replay, to anneal the amount of importance + # sampling correction. + self.priority_beta = self.config.getfloat( + 'ExperienceReplay', 'PriorityBeta', fallback=0.5) + + # Used by prioritized replay, to prevent transitions not being visited + # once their error is zero. + self.priority_epsilon = self.config.getfloat( + 'ExperienceReplay', 'PriorityEpsilon', fallback=0.01) + + # Number of nodes in each hidden layer, starting after the input layer. + self.hidden_layers = self.config.get( + 'NetworkModel', 'HiddenLayerNodes', fallback='[20]') + + # Maximum norm of gradient per sample. No gradient clipping if the + # parameter is missing from the config file. + self.gradient_clipping_threshold = self.config.getfloat( + 'Optimization', 'GradientClippingThreshold', fallback=np.inf) + + # Use Double Q-learning if true. + self.double_q_learning = self.config.getboolean( + 'QLearningAlgo', 'DoubleQLearning', fallback=False) + + def save(self, config_file): + with open(config_file, 'w') as c: + self.config.write(c) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/replay_memory.py b/bindings/python/cntk/contrib/deeprl/agent/shared/replay_memory.py new file mode 100644 index 000000000000..8068bfcd1466 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/replay_memory.py @@ -0,0 +1,163 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Replay memory for Q learning.""" + +from __future__ import division + +import math +import random +from collections import namedtuple + +# Transition for experience replay. +# +# Args: +# state: current state. +# action: action applied to current state. +# reward: scalar representing reward received by applying action to +# current state. +# next_state: the new state after action is applied. +# priority: associated priority. +_Transition = namedtuple('Transition', + ['state', 'action', 'reward', 'next_state', + 'priority']) + + +class ReplayMemory: + """Replay memory to store samples of experience. + + Each transition is represented as (state, action, reward, next_state, + priority) tuple. 'priority' is ignored for non-prioritized experience + replay. + """ + + def __init__(self, capacity, prioritized=False): + """Create replay memory with size capacity.""" + self._use_prioritized_replay = prioritized + self._capacity = capacity + # Position in the list where new experience will be written to. + self._position = 0 + # For prioritized replay, 'sum-tree' data structure is used. + # Transitions are stored in leaf nodes, while internal nodes store the + # sum of priorities from all its descendants. List is used to represent + # this complete binary tree. The following code initializes + # all internal nodes, if any, to have value 0. + self._memory = [0] * (capacity - 1) if prioritized else [] + + def store(self, *args): + """Store a transition in replay memory. + + If the memory is full, the oldest one gets overwritten. 
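+
+        Args are forwarded positionally to the _Transition namedtuple:
+        (state, action, reward, next_state, priority).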
+ """ + if not self._isfull(): + self._memory.append(None) + position = self._next_position_then_increment() + old_priority = 0 if self._memory[position] is None \ + else self._memory[position].priority + transition = _Transition(*args) + self._memory[position] = transition + if self._use_prioritized_replay: + self._update_internal_nodes( + position, transition.priority - old_priority) + + def update_priority(self, map_from_position_to_priority): + """Update priority of transitions. + + Args: + map_from_position_to_priority: dictionary mapping position of + transition to its new priority. position should come from + tuples returned by sample_minibatch(). + """ + if not self._use_prioritized_replay: + return + for position, new_priority in map_from_position_to_priority.items(): + old_priority = self._memory[position].priority + self._memory[position] = _Transition( + self._memory[position].state, + self._memory[position].action, + self._memory[position].reward, + self._memory[position].next_state, + new_priority) + self._update_internal_nodes( + position, new_priority - old_priority) + + def _actual_capacity(self): + """Actual capacity needed. + + For prioritized replay, this includes both leaf nodes containing + transitions and internal nodes containing priority sum. + """ + return 2 * self._capacity - 1 \ + if self._use_prioritized_replay \ + else self._capacity + + def _isfull(self): + return len(self._memory) == self._actual_capacity() + + def _next_position_then_increment(self): + """Similar to position++.""" + start = self._capacity - 1 \ + if self._use_prioritized_replay \ + else 0 + position = start + self._position + self._position = (self._position + 1) % self._capacity + return position + + def _update_internal_nodes(self, index, delta): + """Update internal priority sums when leaf priority has been changed. + + Args: + index: leaf node index + delta: change in priority + """ + while index > 0: + index = (index - 1) // 2 + self._memory[index] += delta + + def size(self): + """Return the current number of transitions.""" + l = len(self._memory) + if self._use_prioritized_replay: + l -= (self._capacity - 1) + return l + + def sample_minibatch(self, batch_size): + """Sample minibatch of size batch_size.""" + pool_size = self.size() + if pool_size == 0: + return [] + + if not self._use_prioritized_replay: + chosen_idx = range(pool_size) \ + if pool_size <= batch_size \ + else random.sample(range(pool_size), batch_size) + else: + delta_p = self._memory[0] / batch_size + chosen_idx = [] + for i in range(batch_size): + lower = max(i * delta_p, 0) + upper = min((i + 1) * delta_p, self._memory[0]) + p = random.uniform(lower, upper) + chosen_idx.append(self._sample_with_priority(p)) + + return [(i, self._memory[i]) for i in chosen_idx] + + def _sample_with_priority(self, p): + parent = 0 + while True: + left = 2 * parent + 1 + if left >= len(self._memory): + # parent points to a leaf node already. 
+ return parent + + left_p = self._memory[left] if left < self._capacity - 1 \ + else self._memory[left].priority + if p <= left_p: + parent = left + else: + if left + 1 >= len(self._memory): + raise RuntimeError('Right child is expected to exist.') + p -= left_p + parent = left + 1 diff --git a/bindings/python/cntk/contrib/deeprl/agent/tabular_qlearning.py b/bindings/python/cntk/contrib/deeprl/agent/tabular_qlearning.py new file mode 100644 index 000000000000..a0790ee5cf04 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/tabular_qlearning.py @@ -0,0 +1,121 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Tabular Q-learning.""" + +import copy + +import numpy as np + +from .agent import AgentBaseClass +from .shared.qlearning_parameters import QLearningParameters + + +class TabularQLearning(AgentBaseClass): + """Q-learning agent with tabular representation.""" + + def __init__(self, cfg_filename, o_space, a_space): + """Constructor for Q learning algorithm with tabular representation.""" + super(TabularQLearning, self).__init__(o_space, a_space) + + self._parameters = QLearningParameters(cfg_filename) + if self._parameters.q_representation != 'tabular': + raise ValueError( + 'Unexpected representation for tabular Q-learning: "{0}"' + '\n'.format(self._parameters.q_representation)) + + # Discretize the observation space if necessary + if self._classname(o_space) != 'gym.spaces.discrete.Discrete': + self._discretize_observation_space( + o_space, self._parameters.discretization_resolution) + + self._q = self._parameters.initial_q + \ + np.zeros((self._num_states, self._num_actions)) + print('Initialized discrete Q-learning agent with {0} states and ' + '{1} actions.'.format(self._num_states, self._num_actions)) + + self.episode_count = 0 + # step_count is incremented each time after receiving reward. 
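+ # i.e. once per step() and once per end() call, but not in start().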
+ self.step_count = 0 + + def start(self, state): + """Start a new episode.""" + self._adjust_exploration_rate() + self._last_state = self._preprocess_state(state) + self._last_action, action_behavior = \ + self._choose_action(self._last_state) + self.episode_count += 1 + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def step(self, reward, next_state): + """Observe one transition and choose an action.""" + self._adjust_learning_rate() + self.step_count += 1 + + next_encoded_state = self._preprocess_state(next_state) + td_err = reward + self._parameters.gamma * \ + np.max(self._q[next_encoded_state]) - \ + self._q[self._last_state, self._last_action] + self._q[self._last_state, self._last_action] += self._eta * td_err + + self._adjust_exploration_rate() + self._last_state = next_encoded_state + self._last_action, action_behavior = self._choose_action( + self._last_state) + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def end(self, reward, next_state): + """Last observed reward/state of the episode (which then terminates).""" + self._adjust_learning_rate() + self.step_count += 1 + + td_err = reward - self._q[self._last_state, self._last_action] + self._q[self._last_state, self._last_action] += self._eta * td_err + + def set_as_best_model(self): + """Copy current model to best model.""" + self._best_model = copy.deepcopy(self._q) + + def save(self, filename): + """Save best model to file.""" + with open(filename, 'w') as f: + for s in range(self._num_states): + f.write('{0}\t{1}\n'.format(s, str(self._best_model[s]))) + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + self._parameters.save(filename) + + def enter_evaluation(self): + """Setup before evaluation.""" + self._epsilon = 0 + + def _adjust_learning_rate(self): + self._eta = self._parameters.eta_minimum + max( + 0, + (self._parameters.initial_eta - self._parameters.eta_minimum) * + (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) + + def _adjust_exploration_rate(self): + self._epsilon = self._parameters.epsilon_minimum + max( + 0, + (self._parameters.initial_epsilon - self._parameters.epsilon_minimum) * + (1 - float(self.step_count)/self._parameters.epsilon_decay_step_count)) + + def _choose_action(self, state): + """Epsilon greedy policy.""" + if np.random.uniform(0, 1) < self._epsilon: + return np.random.randint(self._num_actions), 'RANDOM' + else: + return np.argmax(self._q[state]), 'GREEDY' + + def _preprocess_state(self, state): + """Discretize state to table row index.""" + o = self._discretize_state_if_necessary(state) + return o diff --git a/bindings/python/cntk/contrib/deeprl/tests/agent_test.py b/bindings/python/cntk/contrib/deeprl/tests/agent_test.py new file mode 100644 index 000000000000..a44c200fbe0e --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/agent_test.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest + +import cntk.contrib.deeprl.tests.spaces as spaces +from cntk.contrib.deeprl.agent.agent import AgentBaseClass + + +class AgentBaseClassTest(unittest.TestCase): + """Unit tests for AgentBaseClass.""" + + def test_init_unsupported_action_space(self): + action_space = spaces.Box(0, 1, (1,)) + observation_space = spaces.Discrete(3) + self.assertRaises( + ValueError, AgentBaseClass, observation_space, action_space) + + def test_init_unsupported_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Tuple( + [spaces.Discrete(3), spaces.Discrete(3)]) + self.assertRaises( + ValueError, AgentBaseClass, observation_space, action_space) + + def test_init_discrete_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Discrete(3) + sut = AgentBaseClass(observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertEqual(sut._num_states, 3) + self.assertEqual(sut._shape_of_inputs, (3, )) + self.assertTrue(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + def test_init_multibinary_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.MultiBinary(3) + sut = AgentBaseClass(observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (3, )) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + def test_init_box_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = AgentBaseClass(observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1, )) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) diff --git a/bindings/python/cntk/contrib/deeprl/tests/cntk_utils_test.py b/bindings/python/cntk/contrib/deeprl/tests/cntk_utils_test.py new file mode 100644 index 000000000000..f5976f0e5d04 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/cntk_utils_test.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest + +import numpy as np + +from cntk.contrib.deeprl.agent.shared.cntk_utils import (huber_loss, + negative_of_entropy_with_softmax) +from cntk.ops import input_variable + + +class CNTKUtilsTest(unittest.TestCase): + """Unit tests for cntk_utils.""" + + def test_huber_loss(self): + i1 = input_variable((2)) + i2 = input_variable((2)) + + np.testing.assert_array_equal( + huber_loss(i1, i2).eval({ + i1: [[2, 1], [1, 5]], + i2: [[4, 1], [1, 4]] + }), + [1.5, 0.5] + ) + + def test_entropy(self): + i = input_variable((2)) + + np.testing.assert_almost_equal( + negative_of_entropy_with_softmax(i).eval({ + i: [[0.5, 0.5], [1000, 1]] + }), + [-0.693147181, 0] + ) diff --git a/bindings/python/cntk/contrib/deeprl/tests/data/initial_policy_network.dnn b/bindings/python/cntk/contrib/deeprl/tests/data/initial_policy_network.dnn new file mode 100644 index 000000000000..5c9b1fb2c814 Binary files /dev/null and b/bindings/python/cntk/contrib/deeprl/tests/data/initial_policy_network.dnn differ diff --git a/bindings/python/cntk/contrib/deeprl/tests/discretize_test.py b/bindings/python/cntk/contrib/deeprl/tests/discretize_test.py new file mode 100644 index 000000000000..7222b87802cc --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/discretize_test.py @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.shared.discretize import BoxSpaceDiscretizer + + +class BoxSpaceDiscretizerTest(unittest.TestCase): + """Unit tests for BoxSpaceDiscretizer.""" + + def test_scalar(self): + s = spaces.Box(0, 1, (2,)) + sut = BoxSpaceDiscretizer(s, 10) + + self.assertEqual(sut.discretize([0, 0]), 0) + self.assertEqual(sut.discretize([0.05, 0]), 0) + self.assertEqual(sut.discretize([0.95, 0]), 90) + self.assertEqual(sut.discretize([0, 0.05]), 0) + self.assertEqual(sut.discretize([0, 0.95]), 9) + self.assertEqual(sut.discretize([0.1, 0.2]), 12) + self.assertEqual(sut.discretize([1, 1]), 99) + + def test_list(self): + s = spaces.Box(0, 1, (2,)) + sut = BoxSpaceDiscretizer(s, np.array([10, 2])) + + self.assertEqual(sut.discretize([0, 0]), 0) + self.assertEqual(sut.discretize([0.05, 0]), 0) + self.assertEqual(sut.discretize([0.95, 0]), 18) + self.assertEqual(sut.discretize([0, 0.05]), 0) + self.assertEqual(sut.discretize([0, 0.95]), 1) + self.assertEqual(sut.discretize([0.1, 0.2]), 2) + self.assertEqual(sut.discretize([1, 1]), 19) + + sut = BoxSpaceDiscretizer(s, np.array([10, 1])) + + self.assertEqual(sut.discretize([0, 0]), 0) + self.assertEqual(sut.discretize([0.05, 0]), 0) + self.assertEqual(sut.discretize([0.95, 0]), 9) + self.assertEqual(sut.discretize([0, 0.05]), 0) + self.assertEqual(sut.discretize([0, 0.95]), 0) + self.assertEqual(sut.discretize([0.1, 0.2]), 1) + self.assertEqual(sut.discretize([1, 1]), 9) + + def test_array(self): + s = spaces.Box(0, 1, (2, 2)) + sut = BoxSpaceDiscretizer(s, np.array([[2, 2], [2, 2]])) + + self.assertEqual(sut.discretize([[0, 0], [0, 0]]), 0) + self.assertEqual(sut.discretize([[0.05, 0], [0, 0]]), 0) + self.assertEqual(sut.discretize([[0.95, 0], [0, 0]]), 8) + self.assertEqual(sut.discretize([[0, 0.05], [0, 0]]), 0) + self.assertEqual(sut.discretize([[0, 0.95], 
[0, 0]]), 4) + self.assertEqual(sut.discretize([[0, 0], [0.05, 0]]), 0) + self.assertEqual(sut.discretize([[0, 0], [0.95, 0]]), 2) + self.assertEqual(sut.discretize([[0, 0], [0, 0.05]]), 0) + self.assertEqual(sut.discretize([[0, 0], [0, 0.95]]), 1) + self.assertEqual(sut.discretize([[0.1, 0.6], [0.5, 0.2]]), 6) + self.assertEqual(sut.discretize([[1, 1], [1, 1]]), 15) diff --git a/bindings/python/cntk/contrib/deeprl/tests/models_test.py b/bindings/python/cntk/contrib/deeprl/tests/models_test.py new file mode 100644 index 000000000000..b661b6d32548 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/models_test.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest + +from cntk.contrib.deeprl.agent.shared.models import Models + + +class ModelsTest(unittest.TestCase): + """Unit tests for Models.""" + + def test_parse_dueling_network_structure(self): + a, b, c =\ + Models._parse_dueling_network_structure( + "[1, 2, [3], [4, 5]]") + self.assertEqual(a, [1, 2]) + self.assertIsInstance(a[0], int) + self.assertEqual(b, [3]) + self.assertEqual(c, [4, 5]) + + a, b, c =\ + Models._parse_dueling_network_structure( + "[None, [3], [None]]") + self.assertEqual(a, []) + self.assertEqual(b, [3]) + self.assertEqual(c, []) diff --git a/bindings/python/cntk/contrib/deeprl/tests/policy_gradient_test.py b/bindings/python/cntk/contrib/deeprl/tests/policy_gradient_test.py new file mode 100644 index 000000000000..07facc9d0746 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/policy_gradient_test.py @@ -0,0 +1,421 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest +from unittest.mock import MagicMock, Mock, patch + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.policy_gradient import ActorCritic +from cntk.layers import Dense +from cntk.losses import cross_entropy_with_softmax +from cntk.ops import input_variable, placeholder + + +class PolicyGradientTest(unittest.TestCase): + """Unit tests for policy gradient.""" + + @patch('cntk.contrib.deeprl.agent.policy_gradient.Models.feedforward_network') + def test_init(self, mock_model): + mock_model.side_effect = self._setup_test_model + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + self.assertEqual(mock_model.call_count, 2) + mock_model.assert_has_calls( + [ + unittest.mock.call((1,), 2, '[10]', cross_entropy_with_softmax, + use_placeholder_for_input=True), + unittest.mock.call((1,), 1, '[10]', use_placeholder_for_input=True) + ], + any_order=True) + + @unittest.skip("Skip this as CNTK can't reset UID during test.") + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_from_existing_model(self, mock_parameters): + action_space = spaces.Discrete(3) + observation_space = spaces.Box( + np.array([-1.2, -0.07]), np.array([0.6, 0.07])) + mock_parameters.return_value.policy_representation = 'nn' + mock_parameters.return_value.policy_network_hidden_layers = '[2]' + mock_parameters.return_value.initial_policy_network = \ + 'tests/data/initial_policy_network.dnn' + mock_parameters.return_value.preprocessing = '' + + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 3) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (2,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + # Incompatible network structure. + mock_parameters.return_value.policy_network_hidden_layers = '[]' + self.assertRaises( + Exception, ActorCritic, '', observation_space, action_space) + + # Incompatible action space. + mock_parameters.return_value.policy_network_hidden_layers = '[2]' + action_space = spaces.Discrete(2) + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + # Incompatible observation space. 
+ action_space = spaces.Discrete(3) + observation_space = spaces.Box( + np.array([-1.2, -0.07, -1.0]), np.array([0.6, 0.07, 1.0])) + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.Models.feedforward_network') + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_preprocess(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.SlidingWindow' + mock_parameters.return_value.preprocessing_args = '(2, )' + mock_model.side_effect = self._setup_test_model + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + self.assertIsNotNone(sut._preprocessor) + self.assertEqual(sut._preprocessor.output_shape(), (2, 1)) + self.assertEqual(mock_model.call_count, 2) + mock_model.assert_has_calls( + [ + unittest.mock.call((2, 1), 2, '[2]', cross_entropy_with_softmax, + use_placeholder_for_input=True), + unittest.mock.call((2, 1), 1, '[2]', use_placeholder_for_input=True) + ], + any_order=True) + + @patch('cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn') + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_customized_model(self, mock_parameters, mock_model): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.policy_representation = \ + 'cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn' + mock_parameters.return_value.value_function_representation = \ + 'cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn' + mock_model.side_effect = self._setup_test_model + + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(mock_model.call_count, 2) + mock_model.assert_has_calls( + [ + unittest.mock.call((1,), 2, cross_entropy_with_softmax, + use_placeholder_for_input=True), + unittest.mock.call((1,), 1, use_placeholder_for_input=True) + ], + any_order=True) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_unsupported_model(self, mock_parameters): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self._setup_parameters(mock_parameters.return_value) + + # Verify sut can be constructed. 
+ sut = ActorCritic('', observation_space, action_space) + + mock_parameters.return_value.policy_representation = 'undefined' + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + mock_parameters.return_value.policy_representation = 'nn' + mock_parameters.return_value.value_function_representation = 'undefined' + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_shared_representation(self, mock_parameters): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.shared_representation = True + + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + self.assertTrue( + set(sut._policy_network.parameters).issubset( + set(sut._value_network.parameters))) + diff = set(sut._value_network.parameters).difference( + set(sut._policy_network.parameters)) + # one for W and one for b + self.assertEqual(len(diff), 2) + + shapes = [] + for item in diff: + shapes.append(item.shape) + self.assertEqual(set(shapes), {(2, 1), (1,)}) + + def test_rollout(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + sut._choose_action = Mock(side_effect=[(0, ''), (1, ''), (1, '')]) + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + sut.step(0.2, np.array([0.3], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + self.assertEqual(sut._trajectory_states, [0.1, 0.2, 0.3]) + + sut.end(0.3, np.array([0.4], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2, 0.3]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + self.assertEqual(sut._trajectory_states, [0.1, 0.2, 0.3]) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_rollout_preprocess(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.SlidingWindow' + mock_parameters.return_value.preprocessing_args = '(2, "float32")' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + sut._choose_action = Mock(side_effect=[(0, ''), (1, ''), (1, '')]) + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + sut.step(0.2, np.array([0.3], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + np.testing.assert_array_equal( + sut._trajectory_states, + [ + np.array([[0], [0.1]], np.float32), + np.array([[0.1], [0.2]], np.float32), + np.array([[0.2], [0.3]], np.float32) + ]) + + sut.end(0.3, np.array([0.4], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2, 0.3]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + np.testing.assert_array_equal( + sut._trajectory_states, + [ + np.array([[0], [0.1]], np.float32), + np.array([[0.1], [0.2]], np.float32), 
+ np.array([[0.2], [0.3]], np.float32) + ]) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_rollout_with_update(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.update_frequency = 2 + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + sut._update_networks = MagicMock() + + sut._choose_action = Mock(side_effect=[ + (0, ''), (1, ''), (1, ''), (0, ''), (1, ''), (0, '')]) + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + self.assertEqual(sut._trajectory_rewards, [0.1]) + self.assertEqual(sut._trajectory_actions, [0, 1]) + self.assertEqual(sut._trajectory_states, [0.1, 0.2]) + self.assertEqual(sut._update_networks.call_count, 0) + + sut.step(0.2, np.array([0.3], np.float32)) + self.assertEqual(sut._trajectory_rewards, []) + self.assertEqual(sut._trajectory_actions, [1]) + self.assertEqual(sut._trajectory_states, [0.3]) + self.assertEqual(sut._update_networks.call_count, 1) + + sut.step(0.3, np.array([0.4], np.float32)) + self.assertEqual(sut._trajectory_rewards, [0.3]) + self.assertEqual(sut._trajectory_actions, [1, 0]) + self.assertEqual(sut._trajectory_states, [0.3, 0.4]) + self.assertEqual(sut._update_networks.call_count, 1) + + sut.start(np.array([0.5], np.float32)) + self.assertEqual(sut._trajectory_rewards, []) + self.assertEqual(sut._trajectory_actions, [1]) + self.assertEqual(sut._trajectory_states, [0.5]) + self.assertEqual(sut._update_networks.call_count, 1) + + sut.step(0.4, np.array([0.6], np.float32)) + self.assertEqual(sut._trajectory_rewards, []) + self.assertEqual(sut._trajectory_actions, [0]) + self.assertEqual(sut._trajectory_states, [0.6]) + self.assertEqual(sut._update_networks.call_count, 2) + + sut.end(0.5, np.array([0.7], np.float32)) + self.assertEqual(sut._trajectory_rewards, [0.5]) + self.assertEqual(sut._trajectory_actions, [0]) + self.assertEqual(sut._trajectory_states, [0.6]) + self.assertEqual(sut._update_networks.call_count, 2) + + def test_process_accumulated_trajectory(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + # Set up. + self._setup_trajectory(sut) + + # Call test method. + sut._process_accumulated_trajectory(False) + + # Verify results. + self.assertEqual(len(sut._trajectory_rewards), 0) + self.assertEqual(len(sut._trajectory_actions), 0) + self.assertEqual(len(sut._trajectory_states), 0) + + np.testing.assert_array_equal( + sut._input_buffer, + [np.array([0.1], np.float32), np.array([0.2], np.float32)]) + # For unknown reason, got [2.9974999999999996] instead of [2.9975] for + # the following testcase, therefore use assert_array_almost_equal. + np.testing.assert_array_almost_equal( + sut._value_network_output_buffer, + [ + [2.9975], # 3.05 * 0.95 + 0.1 + [3.05] # 3 (initial_r) * 0.95 + 0.2 + ]) + np.testing.assert_array_equal( + sut._policy_network_output_buffer, + [ + np.array([1, 0], np.float32), + np.array([0, 1], np.float32) + ] + ) + np.testing.assert_array_almost_equal( + sut._policy_network_weight_buffer, + [ + [0.9975], # 2.9975 - 2 + [2.05] # 3.05 - 1 + ]) + + def test_process_accumulated_trajectory_keep_last(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + # Set up. 
+ self._setup_trajectory(sut) + + # Call test method. + sut._process_accumulated_trajectory(True) + + # Verify results. + self.assertEqual(len(sut._trajectory_rewards), 0) + self.assertEqual(len(sut._trajectory_actions), 0) + self.assertEqual(sut._trajectory_states, [np.array([0.3], np.float32)]) + + def test_update_policy_and_value_function(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + # Set up. + self._setup_trajectory(sut) + sut._process_accumulated_trajectory(True) + sut._trainer = MagicMock() + sut._adjust_learning_rate = MagicMock() + + # Call test method. + sut._update_networks() + + # Verify value network behavior. + self.assertEqual( + sut._trainer.train_minibatch.call_count, 1) + call_args = sut._trainer.train_minibatch.call_args + np.testing.assert_array_equal( + call_args[0][0][sut._input_variables], + [np.array([0.1], np.float32), np.array([0.2], np.float32)]) + np.testing.assert_array_almost_equal( + call_args[0][0][sut._value_network_output_variables], + [[2.9975], [3.05]]) + np.testing.assert_array_equal( + call_args[0][0][sut._policy_network_output_variables], + [np.array([1, 0], np.float32), np.array([0, 1], np.float32)]) + np.testing.assert_array_almost_equal( + call_args[0][0][sut._policy_network_weight_variables], + [[0.9975], [2.05]]) + + # Verify data buffer size. + self.assertEqual(len(sut._input_buffer), 0) + + def _setup_parameters(self, params): + params.policy_representation = 'nn' + params.policy_network_hidden_layers = '[2]' + params.value_function_representation = 'nn' + params.value_network_hidden_layers = '[2]' + params.relative_step_size = 0.5 + params.regularization_weight = 0.001 + params.initial_eta = 0.1 + params.eta_decay_step_count = 10 + params.eta_minimum = 0.01 + params.gamma = 0.9 + params.preprocessing = '' + params.preprocessing_args = '()' + params.shared_representation = False + params.update_frequency = 4 + params.initial_policy_network = '' + params.momentum = 0.95 + + def _setup_trajectory(self, sut): + # Corresponds to the case where sut.end() is not called. + sut._trajectory_rewards = [0.1, 0.2] + sut._trajectory_actions = [0, 1] + sut._trajectory_states = [ + np.array([0.1], np.float32), + np.array([0.2], np.float32), + np.array([0.3], np.float32)] + sut._value_network.eval = MagicMock(side_effect=[ + np.array([[[3]]], np.float32), + np.array([[[2]]], np.float32), + np.array([[[1]]], np.float32)]) + + def _setup_test_model(self, *args, **kwargs): + inputs = placeholder(shape=(1,)) + outputs = input_variable(shape=(1,), dtype=np.float32) + + q = Dense(1, activation=None)(inputs) + loss = cross_entropy_with_softmax(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } diff --git a/bindings/python/cntk/contrib/deeprl/tests/preprocessing_test.py b/bindings/python/cntk/contrib/deeprl/tests/preprocessing_test.py new file mode 100644 index 000000000000..d1ade1d80503 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/preprocessing_test.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest + +import numpy as np + +from cntk.contrib.deeprl.agent.shared.preprocessing import AtariPreprocessing + + +class AtariPreprocessingTest(unittest.TestCase): + """Unit tests for AtariPreprocessing.""" + + def test_atari_preprocessing(self): + p = AtariPreprocessing((210, 160, 3), 4) + self.assertEqual(p._AtariPreprocessing__history_len, 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.zeros((210, 160, 3), dtype='uint8')) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.zeros((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[-1], + np.zeros((84, 84), dtype='uint8')) + + r = p.preprocess(np.ones((210, 160, 3), dtype=np.uint8)) + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.ones((210, 160, 3), dtype=np.uint8)) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.zeros((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[-1], + np.ones((84, 84), dtype='uint8')) + self.assertEqual(r.shape, (4, 84, 84)) + np.testing.assert_array_equal( + np.squeeze(r[3, :, :]), + np.ones((84, 84), dtype='uint8')) + + p.preprocess(np.ones((210, 160, 3), dtype=np.uint8) * 2) + p.preprocess(np.ones((210, 160, 3), dtype=np.uint8) * 3) + r = p.preprocess(np.ones((210, 160, 3), dtype=np.uint8) * 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.ones((210, 160, 3), dtype='uint8') * 4) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.ones((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[1], + np.ones((84, 84), dtype='uint8') * 2) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[2], + np.ones((84, 84), dtype='uint8') * 3) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[3], + np.ones((84, 84), dtype='uint8') * 4) + self.assertEqual(r.shape, (4, 84, 84)) + np.testing.assert_array_equal( + np.squeeze(r[3, :, :]), + np.ones((84, 84), dtype='uint8') * 4) + + p.reset() + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.zeros((210, 160, 3), dtype='uint8')) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.zeros((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[-1], + np.zeros((84, 84), dtype='uint8')) diff --git a/bindings/python/cntk/contrib/deeprl/tests/qlearning_test.py b/bindings/python/cntk/contrib/deeprl/tests/qlearning_test.py new file mode 100644 index 000000000000..8c84b331057e --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/qlearning_test.py @@ -0,0 +1,491 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest +from unittest.mock import MagicMock, Mock, patch + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.qlearning import QLearning +from cntk.contrib.deeprl.agent.shared.cntk_utils import huber_loss +from cntk.contrib.deeprl.agent.shared.replay_memory import _Transition +from cntk.layers import Dense +from cntk.losses import squared_error +from cntk.ops import input_variable + + +class QLearningTest(unittest.TestCase): + """Unit tests for QLearning.""" + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.Models.feedforward_network') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn(self, + mock_parameters, + mock_model, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + self.assertFalse(hasattr(sut, 'weight_variables')) + self.assertIsNotNone(sut._trainer) + mock_model.assert_called_with((1,), 2, '[2]', None) + mock_replay_memory.assert_called_with(100, False) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn_prioritized_replay(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.use_prioritized_replay = True + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertIsNotNone(sut._weight_variables) + mock_replay_memory.assert_called_with(100, True) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn_preprocessing(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing' + mock_parameters.return_value.preprocessing_args = '()' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + # Preprocessor with default arguments. + self.assertIsNotNone(sut._preprocessor) + self.assertEqual(sut._preprocessor.output_shape(), (4, 84, 84)) + + # Preprocessor with arguments passed as a tuple. + mock_parameters.return_value.preprocessing_args = '(3,)' + sut = QLearning('', observation_space, action_space) + self.assertEqual(sut._preprocessor.output_shape(), (3, 84, 84)) + + # Preprocessor with inappropriate arguments. + mock_parameters.return_value.preprocessing_args = '(3, 4)' + self.assertRaises( + TypeError, QLearning, '', observation_space, action_space) + + # Undefined preprocessor. 
+ mock_parameters.return_value.preprocessing = 'undefined' + self.assertRaises( + ValueError, QLearning, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.qlearning.Models.dueling_network') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dueling_dqn(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.q_representation = 'dueling-dqn' + mock_parameters.return_value.hidden_layers = '[2, [2], [2]]' + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + mock_model.assert_called_with((1,), 2, '[2, [2], [2]]', None) + + @patch('cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_customized_q(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.q_representation = \ + 'cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn' + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + mock_model.assert_called_with((1,), 2, None) + + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_unsupported_q(self, mock_parameters): + instance = mock_parameters.return_value + instance.q_representation = 'undefined' + instance.preprocessing = '' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self.assertRaises( + ValueError, QLearning, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.qlearning.Models.feedforward_network') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn_huber_loss(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.use_error_clipping = True + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + mock_model.assert_called_with((1,), 2, '[2]', huber_loss) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q(self, + mock_parameters, + mock_replay_memory): + """Test if _update_q_periodically() can finish successfully.""" + self._setup_parameters(mock_parameters.return_value) + self._setup_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + sut._trainer.train_minibatch = MagicMock() + sut._choose_action = MagicMock(side_effect=[ + (1, 'GREEDY'), + (0, 'GREEDY'), + (1, 
'RANDOM'), + ]) + + action, debug_info = sut.start(np.array([0.1], np.float32)) + self.assertEqual(action, 1) + self.assertEqual(debug_info['action_behavior'], 'GREEDY') + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 0) + self.assertEqual(sut._epsilon, 0.1) + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.1) + self.assertEqual(sut._last_state, np.array([0.1], np.float32)) + self.assertEqual(sut._last_action, 1) + + action, debug_info = sut.step(1, np.array([0.2], np.float32)) + self.assertEqual(action, 0) + self.assertEqual(debug_info['action_behavior'], 'GREEDY') + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 1) + self.assertEqual(sut._epsilon, 0.09) + # learning rate remains 0.1 as Q is not updated during this time step. + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.1) + self.assertEqual(sut._last_state, np.array([0.2], np.float32)) + self.assertEqual(sut._last_action, 0) + + action, debug_info = sut.step(2, np.array([0.3], np.float32)) + self.assertEqual(action, 1) + self.assertEqual(debug_info['action_behavior'], 'RANDOM') + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 2) + self.assertEqual(sut._epsilon, 0.08) + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.08) + self.assertEqual(sut._last_state, np.array([0.3], np.float32)) + self.assertEqual(sut._last_action, 1) + + sut.end(3, np.array([0.4], np.float32)) + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 3) + self.assertEqual(sut._epsilon, 0.08) + # learning rate remains 0.08 as Q is not updated during this time step. + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.08) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q_dqn(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + self._setup_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + sut._q.eval = \ + MagicMock(return_value=np.array([[[0.2, 0.1]]], np.float32)) + sut._target_q.eval = \ + MagicMock(return_value=np.array([[[0.3, 0.4]]], np.float32)) + sut._trainer = MagicMock() + + sut._update_q_periodically() + + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._input_variables], + [np.array([0.1], np.float32)]) + # 10 (reward) + 0.9 (gamma) x 0.4 (max q_target) -> update action 0 + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._output_variables], + [np.array([10.36, 0.1], np.float32)]) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q_dqn_prioritized_replay(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.use_prioritized_replay = True + self._setup_prioritized_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + def new_q_value(self): + return np.array([[[0.2, 0.1]]], np.float32) + sut._q.eval = MagicMock(side_effect=new_q_value) + sut._target_q.eval = MagicMock( + return_value=np.array([[[0.3, 0.4]]], 
np.float32)) + sut._trainer = MagicMock() + + sut._update_q_periodically() + + self.assertEqual(sut._trainer.train_minibatch.call_count, 1) + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._input_variables], + [ + np.array([0.1], np.float32), + np.array([0.3], np.float32), + np.array([0.1], np.float32) + ]) + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._output_variables], + [ + # 10 (reward) + 0.9 (gamma) x 0.4 (max q_target) + np.array([10.36, 0.1], np.float32), + # 11 (reward) + 0.9 (gamma) x 0.4 (max q_target) + np.array([0.2, 11.36], np.float32), + np.array([10.36, 0.1], np.float32) + ]) + np.testing.assert_almost_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._weight_variables], + [ + [0.16666667], + [0.66666667], + [0.16666667] + ]) + self.assertAlmostEqual( + sut._replay_memory.update_priority.call_args[0][0][3], + 105.2676) # (10.16 + 0.1)^2 + self.assertAlmostEqual( + sut._replay_memory.update_priority.call_args[0][0][4], + 129.0496, + places=6) # (11.26 + 0.1) ^ 2 + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q_double_dqn(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.double_q_learning = True + self._setup_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + sut._q.eval = \ + MagicMock(return_value=np.array([[[0.2, 0.1]]], np.float32)) + sut._target_q.eval = \ + MagicMock(return_value=np.array([[[0.3, 0.4]]], np.float32)) + sut._trainer = MagicMock() + + sut._update_q_periodically() + + # 10 (reward) + 0.9 (gamma) x 0.3 -> update action 0 + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._output_variables], + [np.array([10.27, 0.1], np.float32)]) + + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_populate_replay_memory(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.SlidingWindow' + mock_parameters.return_value.preprocessing_args = '(2, )' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + sut._compute_priority = Mock(side_effect=[1, 2, 3]) + sut._choose_action = Mock( + side_effect=[(0, ''), (0, ''), (1, ''), (1, '')]) + sut._replay_memory = MagicMock() + sut._update_q_periodically = MagicMock() + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + sut.step(0.2, np.array([0.3], np.float32)) + sut.end(0.3, np.array([0.4], np.float32)) + + self.assertEqual(sut._replay_memory.store.call_count, 3) + + call_args = sut._replay_memory.store.call_args_list[0] + np.testing.assert_array_equal( + call_args[0][0], + np.array([[0], [0.1]], np.float32)) + self.assertEqual(call_args[0][1], 0) + self.assertEqual(call_args[0][2], 0.1) + np.testing.assert_array_equal( + call_args[0][3], + np.array([[0.1], [0.2]], np.float32)) + self.assertEqual(call_args[0][4], 1) + + call_args = sut._replay_memory.store.call_args_list[2] + np.testing.assert_array_equal( + call_args[0][0], + np.array([[0.2], [0.3]], np.float32)) + self.assertEqual(call_args[0][1], 1) + 
self.assertEqual(call_args[0][2], 0.3) + self.assertIsNone(call_args[0][3]) + self.assertEqual(call_args[0][4], 3) + + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_replay_start_size(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + # Set exploration rate to 0 + mock_parameters.return_value.initial_epsilon = 0 + mock_parameters.return_value.epsilon_decay_step_count = 100 + mock_parameters.return_value.epsilon_minimum = 0 + mock_parameters.return_value.replay_start_size = 3 + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + sut._trainer = MagicMock() + sut._replay_memory = MagicMock() + + _, debug = sut.start(np.array([0.1], np.float32)) + self.assertEqual(sut.step_count, 0) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'RANDOM') + + _, debug = sut.step(0.1, np.array([0.2], np.float32)) + self.assertEqual(sut.step_count, 1) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'RANDOM') + + sut.end(0.2, np.array([0.3], np.float32)) + self.assertEqual(sut.step_count, 2) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + + _, debug = sut.start(np.array([0.4], np.float32)) + self.assertEqual(sut.step_count, 2) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'RANDOM') + + a, debug = sut.step(0.3, np.array([0.5], np.float32)) + self.assertEqual(sut.step_count, 3) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'GREEDY') + + a, debug = sut.start(np.array([0.6], np.float32)) + self.assertEqual(sut.step_count, 3) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'GREEDY') + + a, debug = sut.step(0.4, np.array([0.7], np.float32)) + self.assertEqual(sut.step_count, 4) + self.assertEqual(sut._trainer.train_minibatch.call_count, 1) + self.assertEqual(debug['action_behavior'], 'GREEDY') + + def _setup_parameters(self, parameters): + parameters.q_representation = 'dqn' + parameters.hidden_layers = '[2]' + parameters.initial_epsilon = 0.1 + parameters.epsilon_decay_step_count = 9 + parameters.epsilon_minimum = 0.01 + parameters.initial_eta = 0.1 + parameters.eta_decay_step_count = 9 + parameters.eta_minimum = 0.01 + parameters.momentum = 0.95 + parameters.gradient_clipping_threshold = 10 + parameters.q_update_frequency = 2 + parameters.gamma = 0.9 + parameters.double_q_learning = False + parameters.replay_start_size = 0 + parameters.replay_memory_capacity = 100 + parameters.use_prioritized_replay = False + parameters.priority_alpha = 2 + parameters.priority_beta = 2 + parameters.priority_epsilon = 0.1 + parameters.preprocessing = '' + parameters.use_error_clipping = False + parameters.replays_per_update = 1 + + def _setup_replay_memory(self, replay_memory): + replay_memory.sample_minibatch.side_effect = \ + [[(0, _Transition( + np.array([0.1], np.float32), + 0, + 10, + np.array([0.2], np.float32), + 0.01))], + [(1, _Transition( + np.array([0.3], np.float32), + 1, + -10, + np.array([0.4], np.float32), + 0.02))]] + + def _setup_prioritized_replay_memory(self, replay_memory): + # Duplicated values can be returned. 
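+ # Stratified sampling may map two strata onto the same high-priority leaf.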
+ replay_memory.sample_minibatch.return_value = \ + [(3, _Transition( + np.array([0.1], np.float32), + 0, + 10, + np.array([0.2], np.float32), + 2)), + (4, _Transition( + np.array([0.3], np.float32), + 1, + 11, + np.array([0.4], np.float32), + 1)), + (3, _Transition( + np.array([0.1], np.float32), + 0, + 10, + np.array([0.2], np.float32), + 2))] + + def _setup_test_model(self): + inputs = input_variable(shape=(1,), dtype=np.float32) + outputs = input_variable(shape=(1,), dtype=np.float32) + + q = Dense(1, activation=None)(inputs) + loss = squared_error(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } diff --git a/bindings/python/cntk/contrib/deeprl/tests/replay_memory_test.py b/bindings/python/cntk/contrib/deeprl/tests/replay_memory_test.py new file mode 100644 index 000000000000..f6a5912e5089 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/replay_memory_test.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest + +from cntk.contrib.deeprl.agent.shared.replay_memory import ReplayMemory + + +class ReplayMemoryTest(unittest.TestCase): + """Unit tests for ReplayMemory.""" + + def test_uniform_sampling(self): + sut = ReplayMemory(3) + self.assertEqual(sut.sample_minibatch(1), []) + + sut.store(1, 'ignore', 'ignore', 'ignore', 0) + self.assertEqual(sut.size(), 1) + self.assertEqual([s[0] for s in sut.sample_minibatch(1)], [0]) + self.assertEqual([s[0] for s in sut.sample_minibatch(2)], [0]) + + sut.store(2, 'ignore', 'ignore', 'ignore', 0) + sut.store(3, 'ignore', 'ignore', 'ignore', 0) + self.assertEqual(sut.size(), 3) + samples = sut.sample_minibatch(1) + self.assertEqual(len(samples), 1) + self.assertTrue(set(s[0] for s in samples).issubset([0, 1, 2])) + self.assertTrue(set(s[1].state for s in samples).issubset([1, 2, 3])) + + sut.store(4, 'ignore', 'ignore', 'ignore', 0) + self.assertEqual(sut.size(), 3) + samples = sut.sample_minibatch(1) + self.assertEqual(len(samples), 1) + self.assertTrue(set(s[0] for s in samples).issubset([0, 1, 2])) + self.assertTrue(set(s[1].state for s in samples).issubset([2, 3, 4])) + + def test_prioritized_sampling(self): + sut = ReplayMemory(3, True) + self.assertEqual(sut.sample_minibatch(1), []) + + sut.store(1, 'ignore', 'ignore', 'ignore', 1) + self.assertEqual(sut.size(), 1) + self.assertEqual([s[0] for s in sut.sample_minibatch(1)], [2]) + self.assertEqual([s[0] for s in sut.sample_minibatch(2)], [2, 2]) + + sut.store(2, 'ignore', 'ignore', 'ignore', 3) + sut.store(3, 'ignore', 'ignore', 'ignore', 2) + self.assertEqual(sut.size(), 3) + self.assertEqual(len(sut._memory), 5) + self.assertEqual(sut._memory[:2], [6, 5]) + + samples = sut.sample_minibatch(2) + self.assertEqual(len(samples), 2) + self.assertEqual(samples[0][0], 3) + self.assertEqual(samples[0][1].state, 2) + + sut.store(4, 'ignore', 'ignore', 'ignore', 5) + self.assertEqual(sut.size(), 3) + self.assertEqual(sut._memory[:2], [10, 5]) + + samples = sut.sample_minibatch(2) + self.assertEqual(len(samples), 2) + self.assertIn(samples[0][0], [3, 4]) + self.assertIn(samples[0][1].state, [2, 3]) + self.assertEqual(samples[1][0], 2) + self.assertEqual(samples[1][1].state, 4) + + sut.update_priority({3: 4, 4: 0.5}) + self.assertEqual(sut._memory[:2], [9.5, 4.5]) diff --git 
a/bindings/python/cntk/contrib/deeprl/tests/spaces.py b/bindings/python/cntk/contrib/deeprl/tests/spaces.py new file mode 100644 index 000000000000..6da5d2acbf22 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/spaces.py @@ -0,0 +1,51 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import numpy as np + + +class Box: + """Fake gym.spaces.box.Box to remove dependency on OpenAI gym.""" + + def __init__(self, low, high, shape=None): + if shape is None: + assert low.shape == high.shape + self.low = low + self.high = high + else: + assert np.isscalar(low) and np.isscalar(high) + self.low = low + np.zeros(shape) + self.high = high + np.zeros(shape) + + self.__class__.__module__ = 'gym.spaces.box' + + @property + def shape(self): + return self.low.shape + + +class Discrete: + """Fake gym.spaces.discrete.Discrete to remove dependency on OpenAI gym.""" + + def __init__(self, n): + self.n = n + self.__class__.__module__ = 'gym.spaces.discrete' + + +class Tuple: + """Fake gym.spaces.tuple_space.Tuple to remove dependency on OpenAI gym.""" + + def __init__(self, spaces): + self.spaces = spaces + self.__class__.__module__ = 'gym.spaces.tuple_space' + + +class MultiBinary: + """Fake gym.spaces.multi_binary.MultiBinary to remove dependency on OpenAI gym.""" + + def __init__(self, n): + self.n = n + self.__class__.__module__ = 'gym.spaces.multi_binary' diff --git a/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py new file mode 100644 index 000000000000..9bc5c0240487 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py @@ -0,0 +1,133 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest +from unittest.mock import patch + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.tabular_qlearning import TabularQLearning + + +class FakeTabularQLearning(TabularQLearning): + """Override TabularQLearning for unittest.""" + + def _choose_action(self, state): + """Fake epsilon greedy policy.""" + return state % 2, 'GREEDY' + + +class TabularQLearningTest(unittest.TestCase): + """Unit tests for TabularQLearning.""" + + def test_init(self): + # Discrete observation space. + action_space = spaces.Discrete(2) + observation_space = spaces.Discrete(3) + sut = TabularQLearning('', observation_space, action_space) + self.assertEqual(sut._num_actions, 2) + self.assertEqual(sut._num_states, 3) + self.assertEqual(sut._shape_of_inputs, (3, )) + self.assertTrue(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + # Discretize observation space to default resolution. 
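+ # With the default DiscretizationResolution of 10 per dimension, a 2-D Box maps to 100 states.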
diff --git a/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py
new file mode 100644
index 000000000000..9bc5c0240487
--- /dev/null
+++ b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py
@@ -0,0 +1,133 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+
+import unittest
+from unittest.mock import patch
+
+import cntk.contrib.deeprl.tests.spaces as spaces
+import numpy as np
+from cntk.contrib.deeprl.agent.tabular_qlearning import TabularQLearning
+
+
+class FakeTabularQLearning(TabularQLearning):
+    """Override TabularQLearning for unittest."""
+
+    def _choose_action(self, state):
+        """Fake epsilon greedy policy."""
+        return state % 2, 'GREEDY'
+
+
+class TabularQLearningTest(unittest.TestCase):
+    """Unit tests for TabularQLearning."""
+
+    def test_init(self):
+        # Discrete observation space.
+        action_space = spaces.Discrete(2)
+        observation_space = spaces.Discrete(3)
+        sut = TabularQLearning('', observation_space, action_space)
+        self.assertEqual(sut._num_actions, 2)
+        self.assertEqual(sut._num_states, 3)
+        self.assertEqual(sut._shape_of_inputs, (3, ))
+        self.assertTrue(sut._discrete_observation_space)
+        self.assertIsNone(sut._space_discretizer)
+        self.assertIsNone(sut._preprocessor)
+
+        # Discretize observation space to default resolution.
+        observation_space = spaces.Box(0, 1, (2,))
+        sut = TabularQLearning('', observation_space, action_space)
+        self.assertEqual(sut._num_states, 100)
+        self.assertEqual(sut._shape_of_inputs, (100, ))
+        self.assertTrue(sut._discrete_observation_space)
+        self.assertIsNotNone(sut._space_discretizer)
+        # Verify encoding of state
+        self.assertEqual(sut._discretize_state_if_necessary([0, 0]), 0)
+        self.assertEqual(sut._discretize_state_if_necessary([0.05, 0]), 0)
+        self.assertEqual(sut._discretize_state_if_necessary([0.95, 0]), 90)
+        self.assertEqual(sut._discretize_state_if_necessary([0, 0.05]), 0)
+        self.assertEqual(sut._discretize_state_if_necessary([0, 0.95]), 9)
+        self.assertEqual(sut._discretize_state_if_necessary([0.1, 0.2]), 12)
+        self.assertEqual(sut._discretize_state_if_necessary([1, 1]), 99)
+
+        # Unsupported observation space for tabular representation
+        observation_space = spaces.MultiBinary(10)
+        self.assertRaises(
+            ValueError, TabularQLearning, '', observation_space, action_space)
+
+    @patch('cntk.contrib.deeprl.agent.tabular_qlearning.QLearningParameters')
+    def test_init_unsupported_q(self, mock_qlearn_parameters):
+        mock_qlearn_parameters.return_value.q_representation = 'undefined'
+
+        action_space = spaces.Discrete(2)
+        observation_space = spaces.Discrete(3)
+        self.assertRaises(
+            ValueError, TabularQLearning, '', observation_space, action_space)
+
+    @patch('cntk.contrib.deeprl.agent.tabular_qlearning.QLearningParameters')
+    def test_update(self, mock_qlearn_parameters):
+        self._setup_qlearn_parameters(mock_qlearn_parameters.return_value)
+        action_space = spaces.Discrete(2)
+        observation_space = spaces.Discrete(3)
+        sut = FakeTabularQLearning('', observation_space, action_space)
+
+        sut.start(0)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 0)
+        self.assertEqual(sut._epsilon, 0.1)
+        # _eta has not been defined so far.
+        self.assertEqual(sut._last_state, 0)
+        self.assertEqual(sut._last_action, 0)
+
+        sut.step(1, 1)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 1)
+        self.assertEqual(sut._epsilon, 0.09)
+        self.assertEqual(sut._eta, 0.1)
+        self.assertEqual(sut._last_state, 1)
+        self.assertEqual(sut._last_action, 1)
+        np.testing.assert_array_equal(
+            sut._q, [[0.1, 0], [0, 0], [0, 0]])
+
+        sut.step(1, 1)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 2)
+        self.assertEqual(sut._epsilon, 0.08)
+        self.assertEqual(sut._eta, 0.09)
+        self.assertEqual(sut._last_state, 1)
+        self.assertEqual(sut._last_action, 1)
+        np.testing.assert_array_equal(
+            sut._q, [[0.1, 0], [0, 0.09], [0, 0]])
+
+        sut.step(1, 1)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 3)
+        self.assertEqual(sut._epsilon, 0.07)
+        self.assertEqual(sut._eta, 0.08)
+        self.assertEqual(sut._last_state, 1)
+        self.assertEqual(sut._last_action, 1)
+        # 0.16928 = 0.09 + (1(reward) + 0.9(gamma)*max([0, 0.09]) - 0.09) * 0.08(eta)
+        np.testing.assert_almost_equal(
+            sut._q, [[0.1, 0], [0, 0.16928], [0, 0]])
+
+        sut.end(1, 2)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 4)
+        # _epsilon remains the same as no action is chosen in end().
+        self.assertEqual(sut._epsilon, 0.07)
+        self.assertEqual(sut._eta, 0.07)
+        # 0.2274304 = 0.16928 + (1(reward) - 0.16928) * 0.07(eta)
+        np.testing.assert_almost_equal(
+            sut._q, [[0.1, 0], [0, 0.2274304], [0, 0]])
+
+    def _setup_qlearn_parameters(self, qlearn_parameters):
+        qlearn_parameters.q_representation = 'tabular'
+        qlearn_parameters.initial_q = 0
+        qlearn_parameters.initial_epsilon = 0.1
+        qlearn_parameters.epsilon_decay_step_count = 9
+        qlearn_parameters.epsilon_minimum = 0.01
+        qlearn_parameters.initial_eta = 0.1
+        qlearn_parameters.eta_decay_step_count = 9
+        qlearn_parameters.eta_minimum = 0.01
+        qlearn_parameters.gamma = 0.9
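
The numeric comments inside test_update spell out the update rule being exercised: Q(s, a) <- Q(s, a) + eta * (reward + gamma * max_a' Q(s', a') - Q(s, a)), with gamma = 0.9 taken from _setup_qlearn_parameters and no bootstrap term at the end of an episode. As a sanity check, the standalone snippet below (q_update is a made-up helper, not part of the patch) reproduces every value asserted on sut._q:

    def q_update(q_sa, reward, eta, gamma=0.9, q_next_max=None):
        # q_next_max is None for the terminal update in end(): no bootstrap.
        target = reward if q_next_max is None else reward + gamma * q_next_max
        return q_sa + eta * (target - q_sa)

    print(round(q_update(0.0, 1, eta=0.1, q_next_max=0.0), 7))      # 0.1
    print(round(q_update(0.0, 1, eta=0.09, q_next_max=0.0), 7))     # 0.09
    print(round(q_update(0.09, 1, eta=0.08, q_next_max=0.09), 7))   # 0.16928
    print(round(q_update(0.16928, 1, eta=0.07), 7))                 # 0.2274304
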
diff --git a/bindings/python/cntk/ops/__init__.py b/bindings/python/cntk/ops/__init__.py
index 6d712eddb523..545ffafa5e0c 100755
--- a/bindings/python/cntk/ops/__init__.py
+++ b/bindings/python/cntk/ops/__init__.py
@@ -1471,6 +1471,50 @@ def cos(x, name=''):
     x = sanitize_input(x)
     return cos(x, name)
 
+@typemap
+def sinh(x, name=''):
+    '''
+    Computes the element-wise sinh of ``x``:
+
+    The output tensor has the same shape as ``x``.
+
+    Example:
+        >>> np.round(C.sinh([[1,0.5],[-0.25,-0.75]]).eval(),5)
+        array([[ 1.1752 ,  0.5211 ],
+               [-0.25261, -0.82232]], dtype=float32)
+
+    Args:
+        x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor
+        name (str, optional): the name of the Function instance in the network
+    Returns:
+        :class:`~cntk.ops.functions.Function`
+    '''
+    from cntk.cntk_py import sinh
+    x = sanitize_input(x)
+    return sinh(x, name)
+
+@typemap
+def cosh(x, name=''):
+    '''
+    Computes the element-wise cosh of ``x``:
+
+    The output tensor has the same shape as ``x``.
+
+    Example:
+        >>> np.round(C.cosh([[1,0.5],[-0.25,-0.75]]).eval(),5)
+        array([[ 1.54308,  1.12763],
+               [ 1.03141,  1.29468]], dtype=float32)
+
+    Args:
+        x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor
+        name (str, optional): the name of the Function instance in the network
+    Returns:
+        :class:`~cntk.ops.functions.Function`
+    '''
+    from cntk.cntk_py import cosh
+    x = sanitize_input(x)
+    return cosh(x, name)
+
 @typemap
 def softmax(x, axis=None, name=''):
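
The doctests above already pin down the expected sinh/cosh values. On a build that includes this patch, a quick consistency check against NumPy and the identity cosh(x)^2 - sinh(x)^2 = 1 could look like the following (illustrative only, not part of the patch):

    import numpy as np
    import cntk as C

    x = np.array([[1, 0.5], [-0.25, -0.75]], dtype=np.float32)
    s = C.sinh(x).eval()
    c = C.cosh(x).eval()

    # The new ops should agree with NumPy and satisfy cosh^2 - sinh^2 == 1.
    np.testing.assert_allclose(s, np.sinh(x), rtol=1e-5)
    np.testing.assert_allclose(c, np.cosh(x), rtol=1e-5)
    np.testing.assert_allclose(c ** 2 - s ** 2, np.ones_like(x), rtol=1e-5)
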