diff --git a/Examples/ReinforcementLearning/deeprl/README.md b/Examples/ReinforcementLearning/deeprl/README.md
new file mode 100644
index 000000000000..74ec516f7c49
--- /dev/null
+++ b/Examples/ReinforcementLearning/deeprl/README.md
@@ -0,0 +1,41 @@
+Examples of running the CNTK DeepRL toolkit.
+
+Dependencies:
+ - OpenAI Gym: https://gym.openai.com/docs
+ - Atari: https://github.com/openai/gym#atari
+   Use the following command to install Atari games on Windows:
+   pip install git+https://github.com/Kojoley/atari-py.git
+
+The following commands assume Examples/ReinforcementLearning/deeprl/scripts as the working directory.
+
+To train an agent using
+ - TabularQLearning
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/tabular_qlearning.config --eval_period=1000 --eval_steps=20000
+
+ - QLearning
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000
+
+ - ActorCritic
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/policy_gradient.config --eval_period=1000 --eval_steps=20000
+
+ - RandomAgent
+   python run.py --env=CartPole-v0 --max_steps=100 --eval_period=1 --eval_steps=200000
+
+Take QLearning as an example: the command
+   python run.py --env=CartPole-v0 --max_steps=100000 --agent_config=config_examples/qlearning.config --eval_period=1000 --eval_steps=20000
+tells the QLearning agent to interact with the environment CartPole-v0 for a maximum of
+100000 steps, while evaluation is done every 1000 steps. Each evaluation reports the
+average reward per episode, obtained by interacting with the environment for 20000 steps.
+
+The agent configs, best model and evaluation results are written to --output_dir,
+which defaults to 'output' in the working directory. To view the evaluation
+results, run the following commands in Python:
+
+import shelve
+d = shelve.open('output/output.wks')
+d['reward_history']
+d.close()
+
+Note: reading and writing the wks file simultaneously will corrupt it. To
+check your results while the program is still running, make a copy of the wks file
+and read the numbers from the copy.
diff --git a/Examples/ReinforcementLearning/deeprl/env/__init__.py b/Examples/ReinforcementLearning/deeprl/env/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/Examples/ReinforcementLearning/deeprl/env/env_factory.py b/Examples/ReinforcementLearning/deeprl/env/env_factory.py
new file mode 100644
index 000000000000..f1040870685c
--- /dev/null
+++ b/Examples/ReinforcementLearning/deeprl/env/env_factory.py
@@ -0,0 +1,29 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+
+from gym import envs
+
+from . 
import maze2d, puddleworld + + +def register_env(env_id): + if env_id == 'Maze2D-v0': + envs.register( + id=env_id, + entry_point='env:maze2d.Maze2D', + kwargs={}, + max_episode_steps=200, + reward_threshold=-110.0) + elif env_id == 'PuddleWorld-v0': + envs.register( + id=env_id, + entry_point='env:puddleworld.PuddleWorld', + kwargs={}, + max_episode_steps=200, + reward_threshold=-100.0) + else: + raise ValueError('Cannot find environment "{0}"\n'.format(env_id)) + return True diff --git a/Examples/ReinforcementLearning/deeprl/env/maze2d.py b/Examples/ReinforcementLearning/deeprl/env/maze2d.py new file mode 100644 index 000000000000..e9a957d65dca --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/env/maze2d.py @@ -0,0 +1,95 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import gym +import numpy as np +from gym import spaces +from gym.utils import seeding + + +class Maze2D(gym.Env): + """This class creates a maze problem given a map.""" + + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + + def __init__(self): + self._load_map() + self.viewer = None + self.action_space = spaces.Discrete(4) + self.observation_space = spaces.Discrete(self.room_lengths[0] * + self.room_lengths[1]) + self._seed() + self._reset() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, type(action)) + + if (np.random.uniform(0, 1) > self.motion_noise): + state0 = self.state[0] + state1 = self.state[1] + if action == 0: # north + state1 = np.minimum(self.room_lengths[1] - 1, state1 + 1) + elif action == 1: # east + state0 = np.minimum(self.room_lengths[0] - 1, state0 + 1) + elif action == 2: # south + state1 = np.maximum(0, state1 - 1) + else: # west + state0 = np.maximum(0, state0 - 1) + if not ([state0, state1] in self.wall_states): + self.state[0] = state0 + self.state[1] = state1 + + done = self._is_goal(self.state) + reward = -1.0 + return self._encode_state(self.state), reward, done, {} + + def _reset(self): + rnd_index = np.random.randint(0, len(self.initial_states)) + self.state = self.initial_states[rnd_index][:] + return self._encode_state(self.state) + + def _load_map(self): + self.room_lengths = np.array([25, 25]) + self.initial_states = [[0, 0]] + self.goal_states = [[24, 24]] + self.wall_states = [] + self._build_wall([2, 0], [2, 15]) + self._build_wall([5, 10], [5, 20]) + self._build_wall([5, 12], [13, 12]) + self._build_wall([15, 5], [15, 24]) + self._build_wall([10, 5], [22, 5]) + self.num_states = self.room_lengths[0] * self.room_lengths[1] + self.motion_noise = 0.05 + + def _is_goal(self, state): + return self.state in self.goal_states + + def _encode_state(self, state): + return int(state[1] * self.room_lengths[0] + state[0]) + + def _build_wall(self, start, end): + x_min = np.maximum(0, np.minimum(start[0], end[0])) + x_max = np.minimum(self.room_lengths[0] - 1, + np.maximum(start[0], end[0])) + y_min = np.maximum(0, np.minimum(start[1], end[1])) + y_max = np.minimum(self.room_lengths[1] - 1, + np.maximum(start[1], end[1])) + for x in range(x_min, x_max + 1): + for y in range(y_min, y_max + 1): + if not ([x, y] in self.goal_states or + [x, y] in self.initial_states): + self.wall_states.append([x, y]) + 
+ def _render(self, mode='human', close=False): + pass diff --git a/Examples/ReinforcementLearning/deeprl/env/puddleworld.py b/Examples/ReinforcementLearning/deeprl/env/puddleworld.py new file mode 100644 index 000000000000..18b8eb89f155 --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/env/puddleworld.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import gym +import numpy as np +from gym import spaces +from gym.utils import seeding + + +class PuddleWorld(gym.Env): + """This class creates a continous-state maze problem given a map.""" + + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + + def __init__(self): + self._load_map() + self.viewer = None + self.action_space = spaces.Discrete(4) + self.observation_space = spaces.Box(np.zeros(2), self.room_lengths) + self._seed() + self._reset() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, type(action)) + + if (np.random.uniform(0., 1.) > self.motion_noise): + state0 = self.state[0] + state1 = self.state[1] + # Motion length is a truncated normal random variable. + motion_length = np.maximum( + 0., + np.minimum( + self.motion_max, + np.random.normal(self.motion_mean, self.motion_std))) + if action == 0: # north + state1 = np.minimum(self.room_lengths[1], + state1 + motion_length) + elif action == 1: # east + state0 = np.minimum(self.room_lengths[0], + state0 + motion_length) + elif action == 2: # south + state1 = np.maximum(0., state1 - motion_length) + else: # west + state0 = np.maximum(0., state0 - motion_length) + self.state[0] = state0 + self.state[1] = state1 + + done = self._is_goal(self.state) + reward = self._compute_reward(self.state) + return self.state, reward, done, {} + + def _reset(self): + self.state = np.copy(self.initial_state) + return self.state + + def _load_map(self): + self.room_lengths = np.array([1., 1.]) + self.initial_state = np.array([0., 0.]) + self.goal_state = np.array([1., 1.]) + self.goal_width = 0.01 + self.motion_noise = 0.05 # probability of no-motion (staying in same state) + self.motion_mean = 0.1 # mean of motion length + self.motion_std = 0.1 * self.motion_mean # std of motion length + self.motion_max = 2.0 * self.motion_mean + self.puddle_centers = [] + self.puddle_radii = [] + self._build_puddle(np.array([0.2, 0.4]), 0.1) + self._build_puddle(np.array([0.5, 0.8]), 0.1) + self._build_puddle(np.array([0.9, 0.1]), 0.1) + self.num_puddles = len(self.puddle_centers) + self.puddle_cost = 2.0 + + def _compute_reward(self, state): + reward = -1 + for i in range(self.num_puddles): + delta = state - self.puddle_centers[i] + dist = np.dot(delta, delta) + if dist <= self.puddle_radii[i]: + reward -= self.puddle_cost + return reward + + def _is_goal(self, state): + return state[0] >= self.goal_state[0] - self.goal_width and \ + state[1] >= self.goal_state[1] - self.goal_width + + def _build_puddle(self, center, radius): + self.puddle_centers.append(center) + self.puddle_radii.append(radius) + + def _render(self, mode='human', close=False): + pass diff --git a/Examples/ReinforcementLearning/deeprl/scripts/config_examples/policy_gradient.config 
b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/policy_gradient.config new file mode 100644 index 000000000000..218b7db8b20c --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/policy_gradient.config @@ -0,0 +1,35 @@ +# See cntk.contrib.deeprl.agent.shared.policy_gradient_parameters for detailed +# explanation of each parameter. + +[General] +Agent = actor_critic +Gamma = 0.99 +# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing +# PreProcessingArgs = (4,) + +[PolicyGradient] +SharedRepresentation = False +# PolicyRepresentation/ValueFunctionRepresentation can be nn, or some +# customized model defined as module_name.method_name, e.g. +# PolicyRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn +PolicyRepresentation = nn +InitialPolicy = +# ValueFunctionRepresentation is ignored when SharedRepresentation is true +ValueFunctionRepresentation = nn +UpdateFrequency = 32 +RelativeStepSize = 0.5 +RegularizationWeight = 0.001 + +[NetworkModel] +# Use (a list of integers) when PolicyRepresentation is nn +PolicyNetworkHiddenLayerNodes = [20] + +# Use (a list of integers) when ValueFunctionRepresentation is nn, ignored when +# SharedRepresentation is true +ValueNetworkHiddenLayerNodes = [20] + +[Optimization] +Momentum = 0.95 +InitialEta = 0.01 +EtaDecayStepCount = 10000 +EtaMinimum = 0.01 diff --git a/Examples/ReinforcementLearning/deeprl/scripts/config_examples/qlearning.config b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/qlearning.config new file mode 100644 index 000000000000..9d3b692c54bd --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/qlearning.config @@ -0,0 +1,46 @@ +# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed +# explanation of each parameter. + +[General] +Agent = qlearning +Gamma = 0.99 +# PreProcessing = cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing +# PreProcessingArgs = (4,) + +[QLearningAlgo] +InitialEpsilon = 1.0 +EpsilonDecayStepCount = 10000 +EpsilonMinimum = 0.01 +InitialQ = 0.0 +TargetQUpdateFrequency = 100 +QUpdateFrequency = 4 +MinibatchSize = 32 +# QRepresentation can be 'dqn', 'dueling-dqn', or some customized model defined as +# module_name.method_name, e.g. +# QRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn +QRepresentation = dqn +ErrorClipping = False +ReplaysPerUpdate = 1 + +[ExperienceReplay] +Capacity = 500 +StartSize = 100 +Prioritized = True +PriorityAlpha = 0.7 +PriorityBeta = 1 +PriorityEpsilon = 0.0001 + +[NetworkModel] +# Use (a list of integers) when QRepresentation is 'dqn' +HiddenLayerNodes = [20] + +# Or use (a list of integers followed by two lists of integers) when +# QRepresentation is 'dueling-dqn' +; HiddenLayerNodes = [10, [5], [5]] + +[Optimization] +Momentum = 0.9 +InitialEta = 0.01 +EtaDecayStepCount = 10000 +EtaMinimum = 0.0001 +GradientClippingThreshold = 10 diff --git a/Examples/ReinforcementLearning/deeprl/scripts/config_examples/tabular_qlearning.config b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/tabular_qlearning.config new file mode 100644 index 000000000000..1367ae374867 --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/config_examples/tabular_qlearning.config @@ -0,0 +1,17 @@ +# See cntk.contrib.deeprl.agent.shared.qlearning_parameters for detailed +# explanation of each parameter. 
+ +[General] +Agent = tabular_qlearning +Gamma = 0.99 + +[QLearningAlgo] +InitialEpsilon = 1.0 +EpsilonDecayStepCount = 100000 +EpsilonMinimum = 0.01 +InitialEta = 0.5 +EtaDecayStepCount = 100000 +EtaMinimum = 0.1 +InitialQ = 0.0 +DiscretizationResolution = 10 +QRepresentation = tabular diff --git a/Examples/ReinforcementLearning/deeprl/scripts/run.py b/Examples/ReinforcementLearning/deeprl/scripts/run.py new file mode 100644 index 000000000000..50238dea1434 --- /dev/null +++ b/Examples/ReinforcementLearning/deeprl/scripts/run.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import argparse +import os +import shelve +import sys +import time +from contextlib import closing + +import numpy as np +from gym import envs +from gym.envs.atari.atari_env import AtariEnv + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +from cntk.contrib.deeprl.agent import agent_factory +from env import env_factory + + +def new_episode(): + """Start a new episode. + + For Atari games, perform no-op actions at the beginning of the episode. + """ + observation = env.reset() + if args.render: + env.render() + if isinstance(env.env, AtariEnv): + for t in range(args.num_noop): + observation, reward, isTerminal, _ = env.step(0) + if isTerminal: + print('WARNING: Terminal signal received after {0} steps' + ''.format(t)) + if args.render: + env.render() + return observation + + +def evaluate_agent_if_necessary(eval_count, start_time): + """Evaluate agent every --eval_period steps.""" + if agent.step_count >= eval_count * args.eval_period: + elapsed_time = time.time() - start_time + total_reward = 0 + num_episodes = 0 + episode_reward = 0 + i = 0 + agent.enter_evaluation() + + observation = new_episode() + while i < args.eval_steps: + i += 1 + action = agent.evaluate(observation) + observation, reward, isTerminal, _ = env.step(action) + if args.render: + env.render() + episode_reward += reward + if isTerminal: + num_episodes += 1 + total_reward += episode_reward + episode_reward = 0 + observation = new_episode() + + reward = episode_reward if num_episodes == 0 \ + else total_reward / num_episodes + print('\nAverage reward per episode after training {0} steps: {1}\n' + ''.format(agent.step_count, reward)) + if len(reward_history) == 0 or reward > max(reward_history): + agent.set_as_best_model() + reward_history.append(reward) + if len(training_time) != 0: + elapsed_time += training_time[-1] + training_time.append(elapsed_time) + + # Save results and update eval_count. + filename_prefix = os.path.join(args.output_dir, args.output_dir) + agent.save(filename_prefix + '.model') + with closing(shelve.open(filename_prefix + '.wks', + 'n' if eval_count == 1 else 'c', + 0, + True)) as shelf: + if 'step_count' not in shelf: + shelf['step_count'] = [] + shelf['step_count'].append(agent.step_count) + shelf['reward_history'] = reward_history + shelf['training_time_sec'] = training_time + agent.exit_evaluation() + eval_count += 1 + start_time = time.time() + + return eval_count, start_time + + +if __name__ == '__main__': + # Parse input arguments. 
+ parser = argparse.ArgumentParser() + parser.add_argument('--env', type=str, default='CartPole-v0', + help='Environment that agent iteracts with.') + parser.add_argument('--num_noop', type=int, default=30, help='Number of ' + 'no-op actions to be performed by the agent at the ' + 'start of an episode, for Atari environment only.') + parser.add_argument('--agent_config', type=str, default='', + help='Config file for agent.') + parser.add_argument('--max_steps', type=int, default=1000000, + help='Maximum steps to train an agent.') + parser.add_argument('--max_episode_steps', type=int, default=0, + help='Maximum steps per episode. Use environment ' + 'specific value if 0.') + parser.add_argument('--eval_period', type=int, default=250000, + help='Number of steps taken between each evaluation.') + parser.add_argument('--eval_steps', type=int, default=125000, + help='Number of steps taken during each evaluation.') + parser.add_argument('--verbose', action='store_true', help='Output debug ' + 'info if set to True.') + parser.add_argument('--output_dir', type=str, default='output', + help='Directory where workspace file and model file ' + 'are saved to. Model file will be named as ' + 'output_dir.model, and workspace file will be named ' + 'as output_dir.wks.') + parser.add_argument('--render', action='store_true', help='Render ' + 'environment if set to True.') + parser.add_argument('--seed', type=int, default=1234567, help='Seed for ' + 'random number generator. Negative value is ignored.') + args = parser.parse_args() + + if (args.seed >= 0): + np.random.seed(args.seed) + + # Use xrange for python 2.7 to speed up. + if sys.version_info.major < 3: + range = xrange + + # Create an OpenAI Gym environment, and obtain its state/action + # information. + if args.env not in envs.registry.env_specs.keys(): + # Try to find from local environment libraries. + env_factory.register_env(args.env) + env = envs.make(args.env) + o_space = env.observation_space + a_space = env.action_space + image_observation = True if isinstance( + env.env, AtariEnv) and env.env._obs_type == 'image' else False + print("Loaded environment '{0}'".format(args.env)) + print("Observation space: '{0}'".format(o_space)) + print("Action space: '{0}'".format(a_space)) + print('Is observation an image: {0}'.format(image_observation)) + + if args.max_episode_steps <= 0: + args.max_episode_steps = \ + env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] + + # Create an agent. + agent = agent_factory.make_agent(args.agent_config, + o_space, + a_space) + + # Create output folder, and save current parameter settings. + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + agent.save_parameter_settings( + os.path.join(args.output_dir, args.output_dir + '.params')) + + eval_count = 1 + reward_history = [] + training_time = [] + start_time = time.time() + # Stop when maximum number of steps are reached. + while agent.step_count < args.max_steps: + # Evaluate agent every --eval_period steps. + eval_count, start_time = evaluate_agent_if_necessary( + eval_count, start_time) + # Learn from new episode. 
+ observation = new_episode() + action, debug_info = agent.start(observation) + rewards = 0 + steps = 0 + for t in range(args.max_episode_steps): + observation, reward, isTerminal, _ = env.step(action) + if args.render: + env.render() + if args.verbose: + print('\tStep\t{0}\t/\tAction\t{1},{2}\t/\tReward\t{3}' + ''.format( + agent.step_count, + action, + debug_info.get('action_behavior'), + reward)) + rewards += reward + steps += 1 + if isTerminal: + agent.end(reward, observation) + break + action, debug_info = agent.step(reward, observation) + print('Episode {0}\t{1}/{2} steps\t{3} total reward\tterminated = {4}' + ''.format( + agent.episode_count, steps, agent.step_count, rewards, isTerminal)) + sys.stdout.flush() + env.close() diff --git a/Source/ActionsLib/NetworkDescriptionLanguage.cpp b/Source/ActionsLib/NetworkDescriptionLanguage.cpp index 63b00e11f56a..7094dd8656a2 100644 --- a/Source/ActionsLib/NetworkDescriptionLanguage.cpp +++ b/Source/ActionsLib/NetworkDescriptionLanguage.cpp @@ -176,6 +176,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable) else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true; + else if (EqualInsensitive(nodeType, OperationNameOf(CoshNode), L"Cosh")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CosineNode), L"Cos")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CrossEntropyNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(CrossEntropyWithSoftmaxNode), L"CEWithSM")) ret = true; @@ -224,6 +225,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable) #endif else if (EqualInsensitive(nodeType, OperationNameOf(SequenceWithSoftmaxNode), L"SEWithSM")) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SigmoidNode))) ret = true; + else if (EqualInsensitive(nodeType, OperationNameOf(SinhNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SinNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SoftmaxNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(SparseInputValue), L"SparseInput")) ret = true; diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 4dff9b81a895..2c94913c24e6 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -645,6 +645,7 @@ ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new Computa // TODO: ColumnElementTimes = ElementTimes CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'CosDistance' ; inputs = _AsNodes (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = _AsNodes (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ] +Cosh(x, tag='') = new ComputationNode [ operation = 'Cosh' ; inputs = _AsNodes (x) /*plus the function args*/ ] Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = _AsNodes (x) /*plus the function args*/ ] CrossEntropy(refProbVectorSequence, 
outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = _AsNodes (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ] DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = _AsNodes (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ] @@ -674,6 +675,7 @@ Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = ' # TODO: Scale = ElementTimes ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = _AsNodes (cond : indexSequence : sourceData) /*plus the function args*/ ] Sin(z, tag='') = new ComputationNode [ operation = 'Sin' ; inputs = _AsNodes (z) /*plus the function args*/ ] +Sinh(x, tag='') = new ComputationNode [ operation = 'Sinh' ; inputs = _AsNodes (x) /*plus the function args*/ ] Softmax (z, axis=0, tag='') = # TODO: replace this with more efficient version below once we have ReduceLogSum if axis == 0 then new ComputationNode [ operation = 'Softmax' ; inputs = _AsNodes (z) /*plus the function args*/ ] else diff --git a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h index b1cd71455be8..f0273b47c8df 100644 --- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h +++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h @@ -3582,6 +3582,16 @@ namespace CNTK /// CNTK_API FunctionPtr Cos(const Variable& operand, const std::wstring& name = L""); + /// + /// Create an instance of the CNTK built-in elementwise cosh operation with the specified input operand. + /// + CNTK_API FunctionPtr Cosh(const Variable& operand, const std::wstring& name = L""); + + /// + /// Create an instance of the CNTK built-in elementwise sinh operation with the specified input operand. + /// + CNTK_API FunctionPtr Sinh(const Variable& operand, const std::wstring& name = L""); + /// /// Create an instance of the CNTK built-in elementwise linear rectifier operation with the specified input operand. 
/// diff --git a/Source/CNTKv2LibraryDll/BackCompat.cpp b/Source/CNTKv2LibraryDll/BackCompat.cpp index d528c5c1b2b4..79a29bb5c11a 100644 --- a/Source/CNTKv2LibraryDll/BackCompat.cpp +++ b/Source/CNTKv2LibraryDll/BackCompat.cpp @@ -146,6 +146,10 @@ namespace CNTK opType = PrimitiveOpType::Cos; else if (node->OperationName() == OperationNameOf(SinNode)) opType = PrimitiveOpType::Sin; + else if (node->OperationName() == OperationNameOf(CoshNode)) + opType = PrimitiveOpType::Cosh; + else if (node->OperationName() == OperationNameOf(SinhNode)) + opType = PrimitiveOpType::Sinh; else if (node->OperationName() == OperationNameOf(PassNode)) opType = PrimitiveOpType::Pass; else if (node->OperationName() == OperationNameOf(LabelsToGraphNode)) diff --git a/Source/CNTKv2LibraryDll/CompositeFunction.cpp b/Source/CNTKv2LibraryDll/CompositeFunction.cpp index cfc106f3be77..4c15309ec87b 100755 --- a/Source/CNTKv2LibraryDll/CompositeFunction.cpp +++ b/Source/CNTKv2LibraryDll/CompositeFunction.cpp @@ -628,6 +628,12 @@ namespace CNTK case PrimitiveOpType::Sin: computationNodePtr = New>(network->GetDeviceId(), internalNodeName); break; + case PrimitiveOpType::Cosh: + computationNodePtr = New>(network->GetDeviceId(), internalNodeName); + break; + case PrimitiveOpType::Sinh: + computationNodePtr = New>(network->GetDeviceId(), internalNodeName); + break; case PrimitiveOpType::ReLU: computationNodePtr = New>(network->GetDeviceId(), internalNodeName); break; diff --git a/Source/CNTKv2LibraryDll/Function.cpp b/Source/CNTKv2LibraryDll/Function.cpp index 64594ecb4f9a..9960148152ac 100755 --- a/Source/CNTKv2LibraryDll/Function.cpp +++ b/Source/CNTKv2LibraryDll/Function.cpp @@ -1050,6 +1050,16 @@ namespace CNTK return UnaryOp(PrimitiveOpType::Cos, operand, Dictionary(), name); } + FunctionPtr Cosh(const Variable& operand, const std::wstring& name) + { + return UnaryOp(PrimitiveOpType::Cosh, operand, Dictionary(), name); + } + + FunctionPtr Sinh(const Variable& operand, const std::wstring& name) + { + return UnaryOp(PrimitiveOpType::Sinh, operand, Dictionary(), name); + } + FunctionPtr ReLU(const Variable& operand, const std::wstring& name) { return UnaryOp(PrimitiveOpType::ReLU, operand, Dictionary(), name); diff --git a/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp b/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp index fb12b474d363..72b5c582591a 100644 --- a/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp +++ b/Source/CNTKv2LibraryDll/PrimitiveFunction.cpp @@ -363,6 +363,8 @@ namespace CNTK case PrimitiveOpType::LogSoftmax: case PrimitiveOpType::Sin: case PrimitiveOpType::Cos: + case PrimitiveOpType::Cosh: + case PrimitiveOpType::Sinh: case PrimitiveOpType::Pass: case PrimitiveOpType::LabelsToGraph: case PrimitiveOpType::StopGradient: diff --git a/Source/CNTKv2LibraryDll/PrimitiveFunction.h b/Source/CNTKv2LibraryDll/PrimitiveFunction.h index 36502515b835..9ebb3d4976e5 100644 --- a/Source/CNTKv2LibraryDll/PrimitiveFunction.h +++ b/Source/CNTKv2LibraryDll/PrimitiveFunction.h @@ -84,6 +84,8 @@ namespace CNTK {PrimitiveOpType::CosDistance, L"CosDistance"}, {PrimitiveOpType::Sin, L"Sin"}, {PrimitiveOpType::Cos, L"Cos"}, + {PrimitiveOpType::Cosh, L"Cosh"}, + {PrimitiveOpType::Sinh, L"Sinh"}, {PrimitiveOpType::Pass, L"Pass"}, {PrimitiveOpType::Block, L"Block"}, {PrimitiveOpType::Unpooling, L"Unpooling"}, diff --git a/Source/CNTKv2LibraryDll/PrimitiveOpType.h b/Source/CNTKv2LibraryDll/PrimitiveOpType.h index 01b886648c0d..d4b9773286f0 100644 --- a/Source/CNTKv2LibraryDll/PrimitiveOpType.h +++ 
b/Source/CNTKv2LibraryDll/PrimitiveOpType.h @@ -86,6 +86,8 @@ namespace CNTK Gather = 74, StableSigmoid = 75, RandomDistribution = 76, + Sinh = 77, + Cosh = 78, // New op types should only be appended to the end of this list UnknownOP // and UnknownOP should always be last. diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index c8487eeaea9d..8ea0db5c8415 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -45,6 +45,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(ClipNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CosDistanceNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CosDistanceWithNegativeSamplesNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(CoshNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CosineNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CropNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(CrossEntropyNode)) return New>(forward<_Types>(_Args)...); @@ -116,6 +117,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(SigmoidNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(StableSigmoidNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SinNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(SinhNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SliceNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SoftmaxNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SqrtNode)) return New>(forward<_Types>(_Args)...); @@ -645,6 +647,18 @@ shared_ptr> ComputationNetworkBuilder::Sin(c return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName), { a }); } +template +shared_ptr> ComputationNetworkBuilder::Cosh(const ComputationNodePtr a, const std::wstring nodeName) +{ + return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName), { a }); +} + +template +shared_ptr> ComputationNetworkBuilder::Sinh(const ComputationNodePtr a, const std::wstring nodeName) +{ + return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName), { a }); +} + template shared_ptr> ComputationNetworkBuilder::Abs(const ComputationNodePtr a, const std::wstring nodeName) { diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h index 5224ab6a7b2f..13b6ae9934e0 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h @@ -124,6 +124,7 @@ class ComputationNetworkBuilder ComputationNodePtr Clip(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName = L""); ComputationNodePtr Cos(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L""); + ComputationNodePtr Cosh(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr CrossEntropy(const ComputationNodePtr label, const 
ComputationNodePtr prediction, const std::wstring nodeName = L""); ComputationNodePtr CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L""); ComputationNodePtr ForwardBackward(const ComputationNodePtr graph, const ComputationNodePtr features, int blankTokenId, int delayConstraint, const std::wstring nodeName = L""); @@ -179,6 +180,7 @@ class ComputationNetworkBuilder ComputationNodePtr SequenceWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr loglikelihood, const std::wstring nodeName = L""); ComputationNodePtr Sigmoid(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr Sin(const ComputationNodePtr a, const std::wstring nodeName = L""); + ComputationNodePtr Sinh(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr Softmax(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr Sqrt(const ComputationNodePtr a, const std::wstring nodeName = L""); ComputationNodePtr SquareError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L""); diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 09ec5acb25d9..501bf21ea316 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -114,6 +114,8 @@ class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode, pub // FloorNode (input) // CosineNode (input) // SinNode (input) +// CoshNode (input) +// SinhNode (input) // Abs(input) // Negate (input) // Sqrt (input) @@ -145,6 +147,7 @@ class UnaryElementWiseWithOpCodeNodeBase : public ComputationNode, pub // Name Forward and Backward opcodes Gradient optype DeclareUnaryElementWiseWithOpCodeNode(Abs, Abs, ElementwiseProductWithAbsDerivative, binaryWithInputGradient); +DeclareUnaryElementWiseWithOpCodeNode(Cosh, Cosh, ElementwiseProductWithCoshDerivative, binaryWithInputGradient); DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, binaryWithInputGradient); DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None, noGradient); @@ -156,6 +159,7 @@ DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Sin, Sin, ElementwiseProductWithSinDerivative, binaryWithInputGradient); +DeclareUnaryElementWiseWithOpCodeNode(Sinh, Sinh, ElementwiseProductWithSinhDerivative, binaryWithInputGradient); DeclareUnaryElementWiseWithOpCodeNode(Sqrt, Sqrt, ElementwiseProductWithSqrtDerivative, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, binaryWithOutputGradient); DeclareUnaryElementWiseWithOpCodeNode(ExponentialLinearUnit, ExponentialLinearUnit, ElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, binaryWithOutputGradient); diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 9c2b60a14fe6..f3204010fb7f 100755 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -256,6 +256,12 
@@ class MATH_API CPUMatrix : public BaseMatrix CPUMatrix& InplaceNegativeSine(); CPUMatrix& AssignNegativeSineOf(const CPUMatrix& a); + CPUMatrix& InplaceCosh(); + CPUMatrix& AssignCoshOf(const CPUMatrix& a); + + CPUMatrix& InplaceSinh(); + CPUMatrix& AssignSinhOf(const CPUMatrix& a); + CPUMatrix& InplaceAbs(); CPUMatrix& AssignAbsOf(const CPUMatrix& a); diff --git a/Source/Math/CPUMatrixImpl.h b/Source/Math/CPUMatrixImpl.h index 7d1ed50dc696..6ed53e616c5f 100644 --- a/Source/Math/CPUMatrixImpl.h +++ b/Source/Math/CPUMatrixImpl.h @@ -2726,6 +2726,60 @@ CPUMatrix& CPUMatrix::AssignNegativeSineOf(const CPUMatrix +CPUMatrix& CPUMatrix::InplaceCosh() +{ + return AssignCoshOf(*this); +} + +template +CPUMatrix& CPUMatrix::AssignCoshOf(const CPUMatrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignCoshOf: Matrix a is empty."); + + auto& us = *this; + if (this != &a) + RequireSize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord (i, j, a) + { + const ElemType v = a(i, j); + us(i, j) = cosh(v); + } + + return *this; +} + +//[this]=sinh([this]) element wise +template +CPUMatrix& CPUMatrix::InplaceSinh() +{ + return AssignSinhOf(*this); +} + +template +CPUMatrix& CPUMatrix::AssignSinhOf(const CPUMatrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignSinhOf: Matrix a is empty."); + + auto& us = *this; + if (this != &a) + RequireSize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord (i, j, a) + { + const ElemType v = a(i, j); + us(i, j) = sinh(v); + } + + return *this; +} + //Threshold truncating: this[i] = max( this[i], threshold ) template CPUMatrix& CPUMatrix::InplaceTruncateBottom(const ElemType threshold) diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index dfa07c8aa2c9..5e6d8a91b8a6 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -85,7 +85,7 @@ enum ElementWiseOperator // unary (or binary with constant parameter) opCopy, opNegate, opNot, opAbs, opFloor, opReciprocal, - opSigmoid, opTanh, opSqr, opSqrt, opExp, opLog, opLinearRectifier, opCosine, opSin, opExponentialLinearUnit, opStableSigmoid, + opSigmoid, opTanh, opSqr, opSqrt, opExp, opLog, opLinearRectifier, opCosine, opSin, opCosh, opSinh, opExponentialLinearUnit, opStableSigmoid, // unary ops for use by Matrix class only (there is no TensorView implementation) opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine, opExponentialLinearUnitDerivative, opStableSigmoidDerivative, // binary @@ -96,6 +96,7 @@ enum ElementWiseOperator opElementwiseProductWithSigmoidDerivativeFromOutput, opElementwiseProductWithTanhDerivativeFromOutput, opElementwiseProductWithLinearRectifierDerivativeFromOutput, opElementwiseProductWithLogDerivativeFromOutput, opElementwiseProductWithCosDerivative, opElementwiseProductWithSinDerivative, + opElementwiseProductWithCoshDerivative, opElementwiseProductWithSinhDerivative, opElementwiseProductWithAbsDerivative, opElementwiseProductWithSqrtDerivative, opElementwiseProductWithReciprocalDerivative, opSqrOfDifference, opElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, @@ -133,6 +134,8 @@ enum ElementWiseOperator Macro(LinearRectifier); \ Macro(Cosine); \ Macro(Sin); \ + Macro(Cosh); \ + Macro(Sinh); \ Macro(ExponentialLinearUnit); \ Macro(StableSigmoid); @@ -163,6 +166,8 @@ enum ElementWiseOperator Macro(ElementwiseProductWithLogDerivativeFromOutput); \ Macro(ElementwiseProductWithCosDerivative); \ Macro(ElementwiseProductWithSinDerivative); \ + Macro(ElementwiseProductWithCoshDerivative); \ + 
Macro(ElementwiseProductWithSinhDerivative); \ Macro(ElementwiseProductWithAbsDerivative); \ Macro(ElementwiseProductWithReciprocalDerivative); \ Macro(ElementwiseProductWithSqrtDerivative); \ diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 221ec378c397..43f55592844f 100755 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -458,6 +458,10 @@ void GPUMatrix::performElementWiseFunction(ElementWiseOperator kind, c return _elementWiseCosineOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opNegativeSine: return _elementWiseNegativeSineOnCuda<<>>(src, Data(), N); + case ElementWiseOperator::opCosh: + return _elementWiseCoshOnCuda<<>>(src, Data(), N); + case ElementWiseOperator::opSinh: + return _elementWiseSinhOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opSigmoidDerivative: return _elementWiseSigmoidDerivativeOnCuda<<>>(src, Data(), N); default: LogicError("performElementWiseFunction: unexpected op code %d", (int)kind); @@ -2333,6 +2337,12 @@ DEF_ELEMWISE_ASSIGN_FUNC(Cosine) DEF_ELEMWISE_INPLACE_FUNC(NegativeSine) DEF_ELEMWISE_ASSIGN_FUNC(NegativeSine) +DEF_ELEMWISE_INPLACE_FUNC(Cosh) +DEF_ELEMWISE_ASSIGN_FUNC(Cosh) + +DEF_ELEMWISE_INPLACE_FUNC(Sinh) +DEF_ELEMWISE_ASSIGN_FUNC(Sinh) + template GPUMatrix& GPUMatrix::InplaceTruncateBottom(const ElemType threshold) { diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 39ebd5ce1461..369fef96f92e 100755 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -380,6 +380,12 @@ class MATH_API GPUMatrix : public BaseMatrix GPUMatrix& InplaceNegativeSine(); GPUMatrix& AssignNegativeSineOf(const GPUMatrix& a); + GPUMatrix& InplaceCosh(); + GPUMatrix& AssignCoshOf(const GPUMatrix& a); + + GPUMatrix& InplaceSinh(); + GPUMatrix& AssignSinhOf(const GPUMatrix& a); + GPUMatrix& InplaceAbs(); GPUMatrix& AssignAbsOf(const GPUMatrix& a); diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 61e9e2a8ce20..4a6e0d6fdf84 100755 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -378,6 +378,26 @@ __global__ void _elementWiseNegativeSineOnCuda( res[id] = -sin_(a[id]); }; +template +__global__ void _elementWiseCoshOnCuda( + const ElemType* a, + ElemType* res, + const CUDA_LONG N) +{ + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + res[id] = cosh_(a[id]); +}; + +template +__global__ void _elementWiseSinhOnCuda( + const ElemType* a, + ElemType* res, + const CUDA_LONG N) +{ + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + res[id] = sinh_(a[id]); +}; + template __global__ void _setValue( ElemType* a, diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 2e62ff501240..30f0045db119 100755 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -3133,6 +3133,72 @@ Matrix& Matrix::AssignNegativeSineOf(const Matrix& return *this; } +//[this]=cosh([this]) element wise +template +Matrix& Matrix::InplaceCosh() +{ + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->InplaceCosh(), + m_GPUMatrix->InplaceCosh(), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +template +Matrix& Matrix::AssignCoshOf(const Matrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignCoshOf: Matrix a is empty."); + + DecideAndMoveToRightDevice(a, *this); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->AssignCoshOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignCoshOf(*a.m_GPUMatrix), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +//[this]=sinh([this]) 
element wise +template +Matrix& Matrix::InplaceSinh() +{ + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->InplaceSinh(), + m_GPUMatrix->InplaceSinh(), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +template +Matrix& Matrix::AssignSinhOf(const Matrix& a) +{ + if (a.IsEmpty()) + LogicError("AssignSinhOf: Matrix a is empty."); + + DecideAndMoveToRightDevice(a, *this); + SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->AssignSinhOf(*a.m_CPUMatrix), + m_GPUMatrix->AssignSinhOf(*a.m_GPUMatrix), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + template Matrix& Matrix::InplaceTruncate(const ElemType threshold) { diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 72ef97bb524e..514f29a32a27 100755 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -407,6 +407,12 @@ class MATH_API Matrix : public MatrixBase Matrix& InplaceNegativeSine(); Matrix& AssignNegativeSineOf(const Matrix& a); + Matrix& InplaceCosh(); + Matrix& AssignCoshOf(const Matrix& a); + + Matrix& InplaceSinh(); + Matrix& AssignSinhOf(const Matrix& a); + Matrix& InplaceLog10(); Matrix& AssignLog10Of(const Matrix& a); diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index 1fd0dd509663..3426471ee442 100755 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -1535,6 +1535,30 @@ GPUMatrix& GPUMatrix::AssignNegativeSineOf(const GPUMatrix +GPUMatrix& GPUMatrix::InplaceCosh() +{ + return *this; +} + +template +GPUMatrix& GPUMatrix::AssignCoshOf(const GPUMatrix& /*a*/) +{ + return *this; +} + +template +GPUMatrix& GPUMatrix::InplaceSinh() +{ + return *this; +} + +template +GPUMatrix& GPUMatrix::AssignSinhOf(const GPUMatrix& /*a*/) +{ + return *this; +} + template GPUMatrix& GPUMatrix::InplaceTruncateBottom(const ElemType threshold) { diff --git a/Source/Math/TensorOps.h b/Source/Math/TensorOps.h index 26abdb5ca2be..7b2dd07b2629 100644 --- a/Source/Math/TensorOps.h +++ b/Source/Math/TensorOps.h @@ -49,6 +49,8 @@ OverloadUnaryMathFns(cos); OverloadUnaryMathFns(sin); OverloadUnaryMathFns(floor); OverloadUnaryMathFns(log1p); +OverloadUnaryMathFns(sinh); +OverloadUnaryMathFns(cosh); #pragma pop_macro("OverloadUnaryMathFns") @@ -271,6 +273,8 @@ DefUnaryOp(Sin, sin_(a)); DefUnaryOp(Reciprocal, a == 0 ? 0 : 1 / a); DefUnaryOp(ExponentialLinearUnit, a >= 0 ? a : (exp_(a)-1)); DefUnaryOp(StableSigmoid, StableSigmoid(a)); +DefUnaryOp(Sinh, sinh_(a)); +DefUnaryOp(Cosh, cosh_(a)); #pragma pop_macro("DefUnaryOp") #pragma push_macro("DefBinaryOp") @@ -312,6 +316,8 @@ DefBinaryOp(ElementwiseProductWithReciprocalDerivative, a * -Sqr(b)); // b = out DefBinaryOp(ElementwiseProductWithSqrtDerivative, a / (2 * b)); // b = output; d/dx sqrt(x) = 1/(2 * sqrt(x)) --> note this is the same as ElementwiseQuotient w a constant; if more show up like this we should add more template params DefBinaryOp(SqrOfDifference, Sqr(a - b)); DefBinaryOp(ElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, b >= 0 ? 
a : a*(1+b)); // b = output; +DefBinaryOp(ElementwiseProductWithSinhDerivative, a * cosh_(b)); // note: b = input for sinh() +DefBinaryOp(ElementwiseProductWithCoshDerivative, a * sinh_(b)); // note: b = input for cosh() //DefBinaryOp(Index, IndexElement(a, b, i)); // note: this one uses the third argument #pragma pop_macro("DefBinaryOp") diff --git a/Source/Readers/HTKDeserializers/ConfigHelper.cpp b/Source/Readers/HTKDeserializers/ConfigHelper.cpp index 94df730c80fe..871c8bf975aa 100644 --- a/Source/Readers/HTKDeserializers/ConfigHelper.cpp +++ b/Source/Readers/HTKDeserializers/ConfigHelper.cpp @@ -7,6 +7,7 @@ #include "ConfigHelper.h" #include "DataReader.h" #include "StringUtil.h" +#include namespace CNTK { @@ -164,9 +165,16 @@ vector ConfigHelper::GetMlfPaths() const } wstring list = m_config(L"mlfFileList"); - for (msra::files::textreader r(list); r;) + if (list.find(':') == string::npos) { - result.push_back(r.wgetline()); + for (msra::files::textreader r(list); r;) + { + result.push_back(r.wgetline()); + } + } + else + { + result = m_config(L"mlfFileList", ConfigParameters::Array(stringargvector(vector{}))); } } diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/deeprl_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/deeprl_test.py new file mode 100644 index 000000000000..0de6f86b2e3e --- /dev/null +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/deeprl_test.py @@ -0,0 +1,41 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import os +import platform +import shelve +import shutil +import subprocess + +import pytest + + +def test_deeprl(): + if platform.system() != 'Linux': + pytest.skip('test only runs on Linux (Gym Atari dependency)') + + test_dir = os.path.dirname(os.path.abspath(__file__)) + script_dir = os.path.join(test_dir, '..', '..', '..', '..', 'Examples', + 'ReinforcementLearning', 'deeprl', 'scripts') + script_file = os.path.join(script_dir, 'run.py') + config_file = os.path.join(script_dir, 'config_examples', + 'qlearning.config') + + subprocess.call([ + 'python', script_file, '--env=CartPole-v0', '--max_steps=6000', + '--agent_config=' + config_file, '--eval_period=1000', + '--eval_steps=20000' + ]) + + assert os.path.exists( + os.path.join(test_dir, 'output', 'output.params')) == True + + wks = shelve.open(os.path.join(test_dir, 'output', 'output.wks')) + rewards = wks['reward_history'] + assert len(rewards) >= 5 and len(rewards) <= 6 + assert max(rewards) >= 120 + + shutil.rmtree(os.path.join(test_dir, 'output')) diff --git a/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py b/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py index 18e6b5d3b4f6..4b70bf29aa71 100644 --- a/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py +++ b/Tests/EndToEndTests/CNTKv2Python/Examples/htk_deserializer_test.py @@ -6,7 +6,6 @@ abs_path = os.path.dirname(os.path.abspath(__file__)) data_path = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Speech", "AN4", "Data") - def test_htk_deserializers(): mbsize = 640 epoch_size = 1000 * mbsize @@ -55,4 +54,28 @@ def test_htk_deserializers(): assert True os.chdir(abs_path) -#test_htk_deserializers() + +def test_multiple_mlf_files(): + os.chdir(data_path) + + feature_dim = 33 + num_classes = 132 + context = 2 + + test_mlf_path = "../../../../Tests/EndToEndTests/Speech/Data/glob_00001.mlf" 
+ + features_file = "glob_0000.scp" + label_files = [ "glob_0000.mlf", test_mlf_path] + label_mapping_file = "state.list" + + fd = HTKFeatureDeserializer(StreamDefs( + amazing_features = StreamDef(shape=feature_dim, context=(context,context), scp=features_file))) + + ld = HTKMLFDeserializer(label_mapping_file, StreamDefs( + awesome_labels = StreamDef(shape=num_classes, mlf=label_files))) + + # Make sure we can read at least one minibatch. + mbsource = MinibatchSource([fd,ld]) + mbsource.next_minibatch(1) + + os.chdir(abs_path) diff --git a/Tests/EndToEndTests/Speech/Data/glob_00001.mlf b/Tests/EndToEndTests/Speech/Data/glob_00001.mlf new file mode 100644 index 000000000000..5f1c0b02a380 --- /dev/null +++ b/Tests/EndToEndTests/Speech/Data/glob_00001.mlf @@ -0,0 +1,5 @@ +#!MLF!# +"nonexistent.lab" +0 100000 sil[2] -0.785971 sil 454.794006 +100000 5500000 sil[3] 465.522034 +. diff --git a/Tests/UnitTests/MathTests/CPUMatrixTests.cpp b/Tests/UnitTests/MathTests/CPUMatrixTests.cpp index 5fbedf25b704..ed7ae41027e0 100755 --- a/Tests/UnitTests/MathTests/CPUMatrixTests.cpp +++ b/Tests/UnitTests/MathTests/CPUMatrixTests.cpp @@ -431,6 +431,26 @@ BOOST_FIXTURE_TEST_CASE(CPUMatrixElementOperations, RandomSeedFixture) m_NegSine.SetValue(m_Trig); m_NegSine.AssignNegativeSineOf(m_Trig); BOOST_CHECK(m_NegSine.IsEqualTo(m_NegSine_expected, c_epsilonFloatE4)); + + m3.SetValue(m0); + m3.InplaceCosh(); + m2(0, 0) = 1.54308063; + m2(0, 1) = 3.76219569; + m2(0, 2) = 10.067662; + m2(1, 0) = 27.30823284; + m2(1, 1) = 74.20994852; + m2(1, 2) = 201.71563612; + BOOST_CHECK(m3.IsEqualTo(m2, c_epsilonFloatE4)); + + m3.SetValue(m0); + m3.InplaceSinh(); + m2(0, 0) = 1.17520119; + m2(0, 1) = 3.62686041; + m2(0, 2) = 10.01787493; + m2(1, 0) = 27.2899172; + m2(1, 1) = 74.20321058; + m2(1, 2) = 201.71315737; + BOOST_CHECK(m3.IsEqualTo(m2, c_epsilonFloatE4)); } BOOST_FIXTURE_TEST_CASE(CPUMatrixNorms, RandomSeedFixture) diff --git a/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp b/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp index dd27feff3432..9acff559c442 100755 --- a/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp +++ b/Tests/UnitTests/V2LibraryTests/SerializationTests.cpp @@ -338,7 +338,9 @@ void CheckEnumValuesNotModified() { static_cast(PrimitiveOpType::Assign) == 73 && static_cast(PrimitiveOpType::Gather) == 74 && static_cast(PrimitiveOpType::StableSigmoid) == 75 && - static_cast(PrimitiveOpType::RandomDistribution) == 76, + static_cast(PrimitiveOpType::RandomDistribution) == 76 && + static_cast(PrimitiveOpType::Sinh) == 77 && + static_cast(PrimitiveOpType::Cosh) == 78, "PrimitiveOpType enum value was modified."); } diff --git a/bindings/common/CNTKManagedCommon.i b/bindings/common/CNTKManagedCommon.i index 53a3132cd07a..0a0e3a9a3542 100644 --- a/bindings/common/CNTKManagedCommon.i +++ b/bindings/common/CNTKManagedCommon.i @@ -152,6 +152,8 @@ IGNORE_FUNCTION CNTK::Sigmoid; IGNORE_FUNCTION CNTK::Tanh; IGNORE_FUNCTION CNTK::Sin; IGNORE_FUNCTION CNTK::Cos; +IGNORE_FUNCTION CNTK::Cosh; +IGNORE_FUNCTION CNTK::Sinh; IGNORE_FUNCTION CNTK::ReLU; IGNORE_FUNCTION CNTK::Exp; IGNORE_FUNCTION CNTK::Log; diff --git a/bindings/python/cntk/contrib/deeprl/README.md b/bindings/python/cntk/contrib/deeprl/README.md new file mode 100644 index 000000000000..d1745ead8f36 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/README.md @@ -0,0 +1,26 @@ +CNTK DeepRL toolkit implements deep Q learning (and its variants) and actor-critic method. 
+Tabular Q learning and a random agent are also provided for baseline comparison.
+
+The observation space and action space are represented by an OpenAI gym space type, see
+https://github.com/openai/gym/tree/master/gym/spaces. Currently the toolkit requires the
+action space to be discrete https://github.com/openai/gym/blob/master/gym/spaces/discrete.py,
+i.e., an action is denoted by an integer between 0 and n-1 for n possible actions.
+The observation space can be arbitrary except Tuple https://github.com/openai/gym/blob/master/gym/spaces/tuple_space.py.
+
+An example script is provided at CNTK/Examples/ReinforcementLearning/deeprl/scripts/run.py,
+which interacts with the environment and does training and evaluation. Training details
+are specified via a configuration file. See CNTK/Examples/ReinforcementLearning/deeprl/scripts/config_examples
+for example configuration files for deep Q learning and the actor-critic method.
+
+To apply the deep RL algorithms to a new problem, describe the problem as an environment following
+the examples at CNTK/Examples/ReinforcementLearning/deeprl/env.
+
+References:
+deep Q learning
+- DQN https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
+- Prioritized Experience Replay https://arxiv.org/pdf/1511.05952.pdf
+- Dueling Network https://arxiv.org/pdf/1511.06581.pdf
+- Double Q Learning https://arxiv.org/pdf/1509.06461.pdf
+
+actor-critic
+- Actor-Critic https://arxiv.org/pdf/1602.01783.pdf
diff --git a/bindings/python/cntk/contrib/deeprl/__init__.py b/bindings/python/cntk/contrib/deeprl/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/bindings/python/cntk/contrib/deeprl/agent/__init__.py b/bindings/python/cntk/contrib/deeprl/agent/__init__.py
new file mode 100644
index 000000000000..8b137891791f
--- /dev/null
+++ b/bindings/python/cntk/contrib/deeprl/agent/__init__.py
@@ -0,0 +1 @@
+
diff --git a/bindings/python/cntk/contrib/deeprl/agent/agent.py b/bindings/python/cntk/contrib/deeprl/agent/agent.py
new file mode 100644
index 000000000000..7a758365f086
--- /dev/null
+++ b/bindings/python/cntk/contrib/deeprl/agent/agent.py
@@ -0,0 +1,231 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+"""Base class for defining an agent."""
+
+from abc import ABCMeta, abstractmethod
+
+import numpy as np
+
+from importlib import import_module
+
+from .shared.discretize import BoxSpaceDiscretizer
+
+
+class AgentBaseClass(object):
+    """Base class for defining an agent."""
+
+    __metaclass__ = ABCMeta
+
+    def __init__(self, o_space, a_space):
+        """
+        Constructor for AgentBaseClass.
+
+        Args:
+            o_space: observation space, gym.spaces.tuple_space.Tuple is not
+                supported.
+            a_space: action space, limited to gym.spaces.discrete.Discrete.
+        """
+        if self._classname(a_space) != 'gym.spaces.discrete.Discrete':
+            raise ValueError(
+                'Action space {0} incompatible with {1}. (Only supports '
+                'Discrete action spaces.)'.format(a_space, self))
+        self._num_actions = a_space.n
+
+        # We assume the observation is in one of the following cases:
+        # 1. discrete, and takes values from 0 to n - 1
+        # 2. can be discretized, and the raw state is converted to an internal
+        #    state taking values from 0 to n - 1
+        # 3. raw, such as images from Atari games
+        #
+        # OpenAI gym supports the following observation types:
+        # Discrete, Box, MultiBinary, MultiDiscrete and Tuple. 
Discrete + # corresponds to case 1. Box, MultiBinary and MultiDiscrete can be + # either case 2 or 3. Tuple is a mix of case 1, 2 or 3, and is not + # supported currently. + # + # The observation-related parameters are defined as follows: + # _discrete_observation_space: True for cases 1 and 2, False otherwise. + # State is represented by a scalar. + # _space_discretizer: Not none for case 2 to indicate a conversion on + # state is requried. None otherwise. + # _shape_of_inputs: (n, ) for cases 1 and 2 to indicate it is a vector + # of length n. For case 3, it is the shape of array that represents + # the state. For example, an image input will have shape denoted as + # tuple (channel, width, height). + if not (self._classname(o_space) == 'gym.spaces.discrete.Discrete' or + self._classname(o_space) == 'gym.spaces.multi_binary.MultiBinary' or + self._classname(o_space) == 'gym.spaces.box.Box' or + self._classname(o_space) == 'gym.spaces.multi_discrete.MultiDiscrete'): + raise ValueError( + 'Unsupported observation space type: {0}'.format(o_space)) + + self._space_discretizer = None + self._discrete_observation_space = \ + (self._classname(o_space) == 'gym.spaces.discrete.Discrete') + # Set self._num_states for discrete observation space only. + # Otherwise set it to None so that an exception will be raised + # should it be used later in the code. + self._num_states = \ + o_space.n if self._discrete_observation_space else None + + if (self._classname(o_space) == 'gym.spaces.discrete.Discrete' or + self._classname(o_space) == 'gym.spaces.multi_binary.MultiBinary'): + self._shape_of_inputs = (o_space.n,) + else: + self._shape_of_inputs = o_space.shape + + self._preprocessor = None + self._best_model = None + + @abstractmethod + def start(self, state): + """ + Start a new episode. + + Args: + state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + pass + + @abstractmethod + def step(self, reward, next_state): + """ + Observe one transition and choose an action. + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + pass + + @abstractmethod + def end(self, reward, next_state): + """ + Last observed reward/state of the episode (which then terminates). + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + """ + pass + + @abstractmethod + def save(self, filename): + """Save model to file.""" + pass + + @abstractmethod + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + pass + + @abstractmethod + def set_as_best_model(self): + """Copy current model to best model.""" + pass + + def enter_evaluation(self): + """Setup before evaluation.""" + pass + + def exit_evaluation(self): + """Tear-down after evaluation.""" + pass + + def evaluate(self, o): + """ + Choose action for given observation without updating agent's status. + + Args: + o (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + """ + a, _ = self._choose_action(self._preprocess_state(o)) + return a + + @abstractmethod + def _choose_action(self, state): + """ + Choose an action according to the policy. 
+ + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + + Returns: + action (int): action choosen by agent. + debug_info (str): auxiliary diagnostic information. + """ + pass + + def _discretize_observation_space(self, space, discretization_resolution): + if self._classname(space) == 'gym.spaces.box.Box': + self._space_discretizer = BoxSpaceDiscretizer( + space, + discretization_resolution) + self._discrete_observation_space = True + self._num_states = self._space_discretizer.num_states + self._shape_of_inputs = (self._num_states,) + else: + raise ValueError( + "Unsupported space type for discretization: {0}".format(space)) + + def _discretize_state_if_necessary(self, state): + if self._space_discretizer is not None: + return self._space_discretizer.discretize(state) + else: + return state + + def _index_to_vector(self, index, dimension): + # TODO: consider using cntk.core.Value.one_hot here. + a = np.zeros(dimension,) + a[index] = 1 + return a + + def _preprocess_state(self, state): + """Preprocess state to generate input to neural network. + + When state is a scalar which is the index of the state space, convert + it using one-hot encoding. + + For other cases, state and input are the same, roughly. + + CNTK only supports float32 and float64. Performs appropriate + type conversion as well. + """ + o = self._discretize_state_if_necessary(state) + if self._discrete_observation_space: + o = self._index_to_vector(o, self._num_states) + if self._preprocessor is not None: + o = self._preprocessor.preprocess(o) + # TODO: allow float64 dtype. + if o.dtype.name != 'float32': + o = o.astype(np.float32) + return o + + def _classname(self, instance): + return instance.__class__.__module__ + '.' + instance.__class__.__name__ + + def _import_method(self, path): + """Import method specified as module_name.method_name.""" + module_name, method_name = path.rsplit('.', 1) + try: + module = import_module(module_name) + method = getattr(module, method_name) + except (AttributeError, ImportError): + raise ValueError('Cannot import method: "{0}"'.format(path)) + return method diff --git a/bindings/python/cntk/contrib/deeprl/agent/agent_factory.py b/bindings/python/cntk/contrib/deeprl/agent/agent_factory.py new file mode 100644 index 000000000000..ae053a8f1d39 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/agent_factory.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Factory method to create an agent.""" + +import configparser + +from .policy_gradient import ActorCritic +from .qlearning import QLearning +from .random_agent import RandomAgent +from .tabular_qlearning import TabularQLearning + + +def make_agent(agent_config, o_space, a_space): + """ + Choose appropriate method to create an agent. + + Args: + agent_config: configure file specifying the agent type as well as + training details. + o_space: observation space, gym.spaces.tuple_space.Tuple is not + supported. + a_space: action space, limits to gym.spaces.discrete.Discrete. + + Returns: + subclass inherited from :class:`.agent.AgentBaseClass`: QLearning, + ActorCritic, TabularQLearning, or RandomAgent. 
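+
+    None is returned when the agent type specified in agent_config is not
+    one of the recognized values.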
+ """ + config = configparser.ConfigParser() + config.read(agent_config) + + agent_type = config.get( + 'General', 'Agent', fallback='random').lower() + agent = None + if agent_type == 'qlearning': + agent = QLearning(agent_config, o_space, a_space) + elif agent_type == 'actor_critic': + agent = ActorCritic(agent_config, o_space, a_space) + elif agent_type == 'tabular_qlearning': + agent = TabularQLearning(agent_config, o_space, a_space) + elif agent_type == 'random': + agent = RandomAgent(o_space, a_space) + return agent diff --git a/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py b/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py new file mode 100644 index 000000000000..d65d4533167d --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/policy_gradient.py @@ -0,0 +1,373 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Actor-Critic Policy Gradient.""" + +import cntk as C +import numpy as np + +import ast + +from .agent import AgentBaseClass +from .shared.cntk_utils import negative_of_entropy_with_softmax +from .shared.models import Models +from .shared.policy_gradient_parameters import PolicyGradientParameters + + +class ActorCritic(AgentBaseClass): + """ + Actor-Critic Policy Gradient. + + See https://arxiv.org/pdf/1602.01783.pdf for a description of algorithm. + """ + + def __init__(self, config_filename, o_space, a_space): + """ + Constructor for policy gradient. + + Args: + config_filename: configure file specifying training details. + o_space: observation space, gym.spaces.tuple_space.Tuple is not + supported. + a_space: action space, limits to gym.spaces.discrete.Discrete. + """ + super(ActorCritic, self).__init__(o_space, a_space) + + self._parameters = PolicyGradientParameters(config_filename) + + # Create preprocessor. + if self._parameters.preprocessing: + preproc = self._import_method(self._parameters.preprocessing) + self._preprocessor = preproc( + self._shape_of_inputs, + *ast.literal_eval(self._parameters.preprocessing_args)) + + self._set_up_policy_network_and_value_network() + + self._trajectory_states = [] + self._trajectory_actions = [] + self._trajectory_rewards = [] + + # Training data for the policy and value networks. Note they share the + # same input. + self._input_buffer = [] + self._value_network_output_buffer = [] + self._policy_network_output_buffer = [] + self._policy_network_weight_buffer = [] + + self.episode_count = 0 + self.step_count = 0 + + def start(self, state): + """ + Start a new episode. + + Args: + state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + # Call _process_accumulated_trajectory() to process unused trajectory + # data from previous episode. + self._process_accumulated_trajectory(False) + + # Reset preprocessor. + if self._preprocessor is not None: + self._preprocessor.reset() + + # Append new state and action + o = self._preprocess_state(state) + action, _ = self._choose_action(o) + self._trajectory_states.append(o) + self._trajectory_actions.append(action) + + self.episode_count += 1 + + return action, {} + + def step(self, reward, next_state): + """ + Observe one transition and choose an action. + + Args: + reward (float) : amount of reward returned after previous action. 
+ next_state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + o = self._preprocess_state(next_state) + self._trajectory_rewards.append(reward) + self._trajectory_states.append(o) + self.step_count += 1 + + # Update every self._parameters.update_frequency + if self.step_count % self._parameters.update_frequency == 0: + self._process_accumulated_trajectory(True) + self._update_networks() + + action, _ = self._choose_action(o) + self._trajectory_actions.append(action) + return action, {} + + def end(self, reward, next_state): + """ + Last observed reward/state of the episode (which then terminates). + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + """ + self._trajectory_rewards.append(reward) + self.step_count += 1 + + # Update every self._parameters.update_frequency + if self.step_count % self._parameters.update_frequency == 0: + self._process_accumulated_trajectory(False) + self._update_networks() + + def set_as_best_model(self): + """Copy current model to best model.""" + self._best_model = self._policy_network.clone('clone') + + def _set_up_policy_network_and_value_network(self): + shape_of_inputs = self._shape_of_inputs if self._preprocessor is None \ + else self._preprocessor.output_shape() + self._input_variables = \ + C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + + # Set up policy network. + if self._parameters.policy_representation == 'nn': + model = Models.feedforward_network( + shape_of_inputs, + self._num_actions, + self._parameters.policy_network_hidden_layers, + C.losses.cross_entropy_with_softmax, + use_placeholder_for_input=True) + else: + try: + model_definition_function = self._import_method( + self._parameters.policy_representation) + model = model_definition_function( + shape_of_inputs, + self._num_actions, + C.losses.cross_entropy_with_softmax, + use_placeholder_for_input=True) + except ValueError: + raise ValueError( + 'Unknown representation for policy: "{0}"' + '\n'.format(self._parameters.policy_representation)) + + self._policy_network = model['f'] + self._policy_network.replace_placeholder(self._input_variables) + self._policy_network_output_variables = model['outputs'] + # The weight is computed as part of the Actor-Critic algorithm. + self._policy_network_weight_variables = \ + C.ops.input_variable(shape=(1,), dtype=np.float32) + self._policy_network_loss = \ + model['loss'] * self._policy_network_weight_variables + + # Initialized from a saved model. + if self._parameters.initial_policy_network: + self._policy_network.restore( + self._parameters.initial_policy_network) + + print("Parameterized the agent's policy using neural networks " + '"{0}" with {1} actions.\n' + ''.format(self._parameters.policy_representation, + self._num_actions)) + + # Set up value network. + if self._parameters.shared_representation: + # For shared representation, policy pi and value function V share + # all non-output layers. To use cross_entropy_with_softmax loss + # from cntk, _policy_network defined here doesn't include softmax + # output layer. Therefore _value_network becomes _policy_network + # plus one additional linear output layer. 
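+            # Concretely, the Dense(1) head created below sits on top of the
+            # policy network's (pre-softmax) output, so the value loss
+            # backpropagates through all of the shared layers.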
+ self._value_network = C.layers.Dense(1, activation=None)( + self._policy_network) + self._value_network_output_variables = C.ops.input_variable( + shape=(1,), dtype=np.float32) + self._value_network_loss = C.losses.squared_error( + self._value_network, self._value_network_output_variables) + else: + if self._parameters.value_function_representation == 'nn': + model = Models.feedforward_network( + shape_of_inputs, + 1, # value network outputs a scalar + self._parameters.value_network_hidden_layers, + use_placeholder_for_input=True) + else: + try: + model_definition_function = self._import_method( + self._parameters.value_function_representation) + model = model_definition_function( + shape_of_inputs, + 1, # value network outputs a scalar + use_placeholder_for_input=True) + except ValueError: + raise ValueError( + 'Unknown representation for value function: "{0}"' + '\n'.format(self._parameters.value_function_representation)) + + self._value_network = model['f'] + self._value_network.replace_placeholder(self._input_variables) + self._value_network_output_variables = model['outputs'] + self._value_network_loss = model['loss'] # squared_error by default + + combined_networks = C.ops.combine( + [self._policy_network, self._value_network]) + combined_loss = self._policy_network_loss + \ + self._parameters.regularization_weight * \ + negative_of_entropy_with_softmax(self._policy_network) + \ + self._parameters.relative_step_size * self._value_network_loss + + # The learning rate will be updated later before each minibatch + # training. + # TODO: allow user to specify learner through config file. + self._trainer = C.train.trainer.Trainer( + combined_networks, + (combined_loss, None), + C.learners.adam( + combined_networks.parameters, + C.learners.learning_rate_schedule( + self._parameters.initial_eta, + C.learners.UnitType.sample), + momentum=C.learners.momentum_schedule(self._parameters.momentum), + variance_momentum=C.learners.momentum_schedule(0.999), + use_mean_gradient=True)) + + print("Parameterized the agent's value function using neural network " + '"{0}".\n'.format( + self._parameters.policy_representation + if self._parameters.shared_representation + else self._parameters.value_function_representation)) + + def _adjust_learning_rate(self): + if self._parameters.initial_eta != self._parameters.eta_minimum: + eta = self._parameters.eta_minimum + max( + 0, + (self._parameters.initial_eta - self._parameters.eta_minimum) * + (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) + self._trainer.parameter_learners[0].reset_learning_rate( + C.learners.learning_rate_schedule( + eta, C.learners.UnitType.sample)) + + def _choose_action(self, state): + """ + Choose an action according to policy. + + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + + Returns: + action (int): action choosen by agent. + debug_info (object): probability vector the action is sampled from. 
+ """ + action_probs = \ + C.ops.softmax(self._evaluate_model(self._policy_network, state)).eval() + return np.random.choice(self._num_actions, p=action_probs), action_probs + + def save(self, filename): + """Save model to file.""" + self._best_model.save(filename) + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + self._parameters.save(filename) + + def _evaluate_model(self, model, state): + r"""Evaluate log of pi(\cdot|state) or v(state).""" + return np.squeeze(model.eval({model.arguments[0]: [state]})) + + def _process_accumulated_trajectory(self, keep_last): + """Process accumulated trajectory to generate training data. + + Args: + keep_last (bool): last state without action and reward will be kept + if True. + """ + if not self._trajectory_states: + return + + # If trajectory hasn't terminated, we have _trajectory_states + # and sometimes _trajectory_actions having one more item than + # _trajectory_rewards. Same length is expected if called from + # start() or end(), where the trajectory has terminiated. + if len(self._trajectory_states) == len(self._trajectory_rewards): + bootstrap_r = 0 + else: + # Bootstrap from last state + bootstrap_r = np.asscalar(self._evaluate_model( + self._value_network, self._trajectory_states[-1])) + last_state = self._trajectory_states.pop() + if len(self._trajectory_actions) != len(self._trajectory_rewards): + # This will only happen when agent calls start() to begin + # a new episode without calling end() before to terminate the + # prevous episode. The last action thus can be discarded. + self._trajectory_actions.pop() + + if len(self._trajectory_states) != len(self._trajectory_rewards) or \ + len(self._trajectory_actions) != len(self._trajectory_rewards): + raise RuntimeError("Can't pair (state, action, reward). " + "state/action can only be one more step ahead " + "of rewrad in trajectory.") + + for transition in zip( + self._trajectory_states, + self._trajectory_actions, + self._discount_rewards(bootstrap_r)): + self._input_buffer.append(transition[0]) + self._value_network_output_buffer.append([transition[2]]) + # TODO: consider using cntk.ops.one_hot instead of _index_to_vector + self._policy_network_output_buffer.append( + self._index_to_vector(transition[1], self._num_actions)) + self._policy_network_weight_buffer.append([transition[2] + - self._evaluate_model(self._value_network, transition[0])]) + + # Clear the trajectory history. + self._trajectory_states = [] + self._trajectory_actions = [] + self._trajectory_rewards = [] + if keep_last: + self._trajectory_states.append(last_state) + + def _update_networks(self): + self._adjust_learning_rate() + + # Train the policy network on one minibatch. + self._trainer.train_minibatch( + { + self._input_variables: np.array(self._input_buffer).astype( + np.float32), + self._policy_network_output_variables: + np.array(self._policy_network_output_buffer).astype( + np.float32), + self._policy_network_weight_variables: + np.array(self._policy_network_weight_buffer).astype( + np.float32), + self._value_network_output_variables: + np.array(self._value_network_output_buffer).astype( + np.float32) + }) + + # Clear training data. 
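+        # Training here is on-policy: once the minibatch above has been
+        # consumed, the collected trajectory data no longer reflects the
+        # updated policy, so all four buffers are discarded.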
+ self._input_buffer = [] + self._value_network_output_buffer = [] + self._policy_network_output_buffer = [] + self._policy_network_weight_buffer = [] + + def _discount_rewards(self, bootstrap_r): + discounted_rewards = [0] * len(self._trajectory_rewards) + r = bootstrap_r + for t in reversed(range(len(self._trajectory_rewards))): + r = r * self._parameters.gamma + self._trajectory_rewards[t] + discounted_rewards[t] = r + return discounted_rewards diff --git a/bindings/python/cntk/contrib/deeprl/agent/qlearning.py b/bindings/python/cntk/contrib/deeprl/agent/qlearning.py new file mode 100644 index 000000000000..f90b3a8332dd --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/qlearning.py @@ -0,0 +1,381 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Deep Q-learning and its variants.""" + +import math + +import cntk as C +import numpy as np + +import ast + +from .agent import AgentBaseClass +from .shared.cntk_utils import huber_loss +from .shared.models import Models +from .shared.qlearning_parameters import QLearningParameters +from .shared.replay_memory import ReplayMemory + + +class QLearning(AgentBaseClass): + """ + Q-learning agent. + + Including: + - DQN https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf + - Prioritized Experience Replay https://arxiv.org/pdf/1511.05952.pdf + - Dueling Network https://arxiv.org/pdf/1511.06581.pdf + - Double Q Learning https://arxiv.org/pdf/1509.06461.pdf + """ + + def __init__(self, config_filename, o_space, a_space): + """Constructor for Q learning algorithm. + + Widely known as DQN. Use either predefined neural network structure + (see models.py) or customized network (see customized_models.py). + + Args: + config_filename: configure file specifying training details. + o_space: observation space, gym.spaces.tuple_space.Tuple is not + supported. + a_space: action space, limits to gym.spaces.discrete.Discrete. + """ + super(QLearning, self).__init__(o_space, a_space) + + self._parameters = QLearningParameters(config_filename) + + # Create preprocessor. + if self._parameters.preprocessing: + try: + preproc = self._import_method(self._parameters.preprocessing) + self._preprocessor = preproc( + self._shape_of_inputs, + *ast.literal_eval(self._parameters.preprocessing_args)) + except ValueError: + raise ValueError( + 'Unknown preprocessing method: "{0}"' + '\n'.format(self._parameters.preprocessing)) + + # Set up the Q-function. 
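+        # QRepresentation in the config selects between the predefined
+        # feedforward network ('dqn'), the dueling architecture
+        # ('dueling-dqn'), or a customized model referenced by its
+        # 'module_name.function_name' path (see customized_models.py).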
+ shape_of_inputs = self._shape_of_inputs \ + if self._preprocessor is None \ + else self._preprocessor.output_shape() + if self._parameters.q_representation == 'dqn': + model = Models.feedforward_network( + shape_of_inputs, + self._num_actions, + self._parameters.hidden_layers, + huber_loss if self._parameters.use_error_clipping else None) + elif self._parameters.q_representation == 'dueling-dqn': + model = Models.dueling_network( + shape_of_inputs, + self._num_actions, + self._parameters.hidden_layers, + huber_loss if self._parameters.use_error_clipping else None) + else: + try: + model_definition_function = self._import_method( + self._parameters.q_representation) + model = model_definition_function( + shape_of_inputs, + self._num_actions, + huber_loss if self._parameters.use_error_clipping else None) + except ValueError: + raise ValueError( + 'Unknown representation for Q-learning: "{0}"' + '\n'.format(self._parameters.q_representation)) + + self._q = model['f'] + self._input_variables = model['inputs'] + self._output_variables = model['outputs'] + if self._parameters.use_prioritized_replay: + self._weight_variables = \ + C.ops.input_variable(shape=(1,), dtype=np.float32) + self._loss = model['loss'] * self._weight_variables + else: + self._loss = model['loss'] + + # If gradient_clipping_threshold_per_sample is inf, gradient clipping + # will not be performed. Set gradient_clipping_with_truncation to False + # to clip the norm. + # TODO: allow user to specify learner through config file. + opt = C.learners.adam( + self._q.parameters, + C.learners.learning_rate_schedule( + self._parameters.initial_eta, C.learners.UnitType.sample), + use_mean_gradient=True, + momentum=C.learners.momentum_schedule(self._parameters.momentum), + variance_momentum=C.learners.momentum_schedule(0.999), + gradient_clipping_threshold_per_sample= + self._parameters.gradient_clipping_threshold, + gradient_clipping_with_truncation=False) + self._trainer = C.train.trainer.Trainer( + self._q, (self._loss, None), opt) + + # Initialize target Q. + self._target_q = self._q.clone('clone') + + # Initialize replay memory. + self._replay_memory = ReplayMemory( + self._parameters.replay_memory_capacity, + self._parameters.use_prioritized_replay) + + print('Parameterized Q-learning agent using neural networks ' + '"{0}" with {1} actions.\n' + ''.format(self._parameters.q_representation, + self._num_actions)) + + self.episode_count = 0 + self.step_count = 0 + + def start(self, state): + """ + Start a new episode. + + Args: + state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. + """ + if self._preprocessor is not None: + self._preprocessor.reset() + + self._adjust_exploration_rate() + self._last_state = self._preprocess_state(state) + self._last_action, action_behavior = \ + self._choose_action(self._last_state) + self.episode_count += 1 + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def step(self, reward, next_state): + """ + Observe one transition and choose an action. + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + + Returns: + action (int): action choosen by agent. + debug_info (dict): auxiliary diagnostic information. 
+ """ + next_encoded_state = self._preprocess_state(next_state) + priority = self._compute_priority( + self._last_state, self._last_action, reward, next_encoded_state) + self._replay_memory.store( + self._last_state, + self._last_action, + reward, + next_encoded_state, + priority) + self.step_count += 1 + + # Update Q every self._parameters.q_update_frequency + self._update_q_periodically() + + self._adjust_exploration_rate() + self._last_state = next_encoded_state + self._last_action, action_behavior = self._choose_action( + self._last_state) + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def end(self, reward, next_state): + """ + Last observed reward/state of the episode (which then terminates). + + Args: + reward (float) : amount of reward returned after previous action. + next_state (object): observation provided by the environment. + """ + priority = self._compute_priority( + self._last_state, self._last_action, reward, None) + self._replay_memory.store( + self._last_state, + self._last_action, + reward, + None, + priority) + self.step_count += 1 + + # Update Q every self._parameters.q_update_frequency + self._update_q_periodically() + + def set_as_best_model(self): + """Copy current model to best model.""" + self._best_model = self._q.clone('clone') + + def enter_evaluation(self): + """Setup before evaluation.""" + self._epsilon = 0 + + def _adjust_learning_rate(self): + if self._parameters.initial_eta != self._parameters.eta_minimum: + eta = self._parameters.eta_minimum + max( + 0, + (self._parameters.initial_eta - self._parameters.eta_minimum) * + (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) + + self._trainer.parameter_learners[0].reset_learning_rate( + C.learners.learning_rate_schedule( + eta, C.learners.UnitType.sample)) + + def _adjust_exploration_rate(self): + self._epsilon = self._parameters.epsilon_minimum + max( + 0, + (self._parameters.initial_epsilon - self._parameters.epsilon_minimum) * + (1 - float(self.step_count)/self._parameters.epsilon_decay_step_count)) + + def _choose_action(self, state): + """ + Epsilon greedy policy. + + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + + Returns: + action (int): action choosen by agent. + debug_info (str): auxiliary diagnostic information. + """ + if self.step_count < self._parameters.replay_start_size or \ + np.random.uniform(0, 1) < self._epsilon: + return np.random.randint(self._num_actions), 'RANDOM' + else: + return np.argmax(self._evaluate_q(self._q, state)), 'GREEDY' + + def save(self, filename): + """Save model to file.""" + self._best_model.save(filename) + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + self._parameters.save(filename) + + def _evaluate_q(self, model, state, action=None): + """ + Evaluate Q[state, action]. + + If action is None, return values for all actions. + Args: + state (object): observation seen by agent, which can be different + from what is provided by the environment. The difference comes + from preprcessing. + action (int): action choosen by agent. 
+ """ + q = np.squeeze(model.eval({model.arguments[0]: [state]})) + if action is None: + return q + else: + return q[action] + + def _update_q_periodically(self): + if self.step_count < self._parameters.replay_start_size or \ + self.step_count % self._parameters.q_update_frequency != 0: + return + + self._adjust_learning_rate() + for i in range(self._parameters.replays_per_update): + self._replay_and_update() + + # Clone target network periodically. + if self.step_count % \ + self._parameters.target_q_update_frequency == 0: + self._target_q = self._q.clone('clone') + + def _replay_and_update(self): + """Perform one minibatch update of Q.""" + input_values = [] + output_values = [] + if self._parameters.use_prioritized_replay: + # importance sampling weights. + weight_values = [] + + minibatch = self._replay_memory.sample_minibatch( + self._parameters.minibatch_size) + for index_transition_pair in minibatch: + input_value = index_transition_pair[1].state + + # output_value is the same for all actions except last_action. + output_value = self._evaluate_q( + self._q, index_transition_pair[1].state) + td_err = self._compute_td_err( + index_transition_pair[1].state, + index_transition_pair[1].action, + index_transition_pair[1].reward, + index_transition_pair[1].next_state) + output_value[index_transition_pair[1].action] += td_err + + input_values.append(input_value) + output_values.append(output_value) + + if self._parameters.use_prioritized_replay: + weight_values.append(math.pow( + index_transition_pair[1].priority, + -self._parameters.priority_beta)) + + if self._parameters.use_prioritized_replay: + w_sum = sum(weight_values) + weight_values = [[w / w_sum] for w in weight_values] + self._trainer.train_minibatch( + { + self._input_variables: np.array(input_values).astype( + np.float32), + self._output_variables: np.array(output_values).astype( + np.float32), + self._weight_variables: np.array(weight_values).astype( + np.float32) + }) + + # Update replay priority. 
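+            # Priorities are recomputed from the latest TD errors as
+            # (|td_err| + priority_epsilon) ** priority_alpha (see
+            # _compute_priority) and written back into the replay memory's
+            # sum-tree.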
+ position_priority_map = {} + for index_transition_pair in minibatch: + position_priority_map[index_transition_pair[0]] = \ + self._compute_priority( + index_transition_pair[1].state, + index_transition_pair[1].action, + index_transition_pair[1].reward, + index_transition_pair[1].next_state) + + self._replay_memory.update_priority(position_priority_map) + else: + self._trainer.train_minibatch( + { + self._input_variables: np.array(input_values).astype( + np.float32), + self._output_variables: np.array(output_values).astype( + np.float32) + }) + + def _compute_td_err(self, state, action, reward, next_state): + td_err = reward + if next_state is not None: + if self._parameters.double_q_learning: + td_err += self._parameters.gamma * \ + self._evaluate_q( + self._target_q, + next_state, + np.argmax(self._evaluate_q(self._q, next_state))) + else: + td_err += self._parameters.gamma * np.max( + self._evaluate_q(self._target_q, next_state)) + td_err -= self._evaluate_q(self._q, state, action) + return td_err + + def _compute_priority(self, state, action, reward, next_state): + priority = None + if self._parameters.use_prioritized_replay: + priority = math.pow( + math.fabs(self._compute_td_err( + state, action, reward, next_state)) + + self._parameters.priority_epsilon, + self._parameters.priority_alpha) + return priority diff --git a/bindings/python/cntk/contrib/deeprl/agent/random_agent.py b/bindings/python/cntk/contrib/deeprl/agent/random_agent.py new file mode 100644 index 000000000000..a82a3be43391 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/random_agent.py @@ -0,0 +1,57 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Baseline agent that selects action uniformly randomly.""" + +import numpy as np + +from .agent import AgentBaseClass + + +class RandomAgent(AgentBaseClass): + """Agent that selects action uniformly randomly.""" + + def __init__(self, o_space, a_space): + """Constructor for RandomAgent.""" + super(RandomAgent, self).__init__(o_space, a_space) + + print('Initialized random agent with {0} actions.'.format( + self._num_actions)) + + self.episode_count = 0 + # step_count is incremented each time after receiving reward. 
+ self.step_count = 0 + + def start(self, state): + """Start a new episode.""" + self.episode_count += 1 + action, _ = self._choose_action(state) + return action, {} + + def step(self, reward, next_state): + """Observe one transition and choose an action.""" + self.step_count += 1 + action, _ = self._choose_action(next_state) + return action, {} + + def end(self, reward, next_state): + """Last observed reward/state of the episode (which then terminates).""" + self.step_count += 1 + + def set_as_best_model(self): + """Copy current model to best model.""" + pass + + def save(self, filename): + """Save best model to file.""" + pass + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + pass + + def _choose_action(self, state): + """Random policy.""" + return np.random.randint(self._num_actions), 'RANDOM' diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/__init__.py b/bindings/python/cntk/contrib/deeprl/agent/shared/__init__.py new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/__init__.py @@ -0,0 +1 @@ + diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/cntk_utils.py b/bindings/python/cntk/contrib/deeprl/agent/shared/cntk_utils.py new file mode 100644 index 000000000000..478b9ed6d0e4 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/cntk_utils.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Utility functions.""" + +import cntk.ops as C + + +def huber_loss(output, target): + r"""See https://en.wikipedia.org/wiki/Huber_loss for definition. + + \delta is set to 1. This is not the right definition if output and target + differ in more than one dimension. + """ + a = target - output + return C.reduce_sum(C.element_select( + C.less(C.abs(a), 1), C.square(a) * 0.5, C.abs(a) - 0.5)) + + +def negative_of_entropy_with_softmax(p): + """See https://en.wikipedia.org/wiki/Entropy_(information_theory).""" + return C.reduce_sum(C.softmax(p) * p) - C.reduce_log_sum_exp(p) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/customized_models.py b/bindings/python/cntk/contrib/deeprl/agent/shared/customized_models.py new file mode 100644 index 000000000000..8e3985d9b4c8 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/customized_models.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Customized Q function or (unnormalized) log of policy function. + +If models from cntk.contrib.deeprl.agent.shared.models are not adequate, write +your own model as a function, which takes two required arguments +'shape_of_inputs', 'number_of_outputs', and two optional arguments +'loss_function', 'use_placeholder_for_input', and outputs a dictionary +containing 'inputs', 'outputs', 'f' and 'loss'. In the config file, set +QRepresentation or PolicyRepresentation to path (module_name.function_name) of +the function. QLearning/PolicyGradient will then automatically search for it. 
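+
+For example, to point QLearning at the conv_dqn model defined below, an
+illustrative config entry would be:
+
+    [QLearningAlgo]
+    QRepresentation = cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn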
+""" + +import cntk as C +import numpy as np + + +def conv_dqn(shape_of_inputs, + number_of_outputs, + loss_function=None, + use_placeholder_for_input=False): + """Example convolutional neural network for approximating the Q value function. + + This is the model used in the original DQN paper + https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf. + + Args: + shape_of_inputs: tuple of array (input) dimensions. + number_of_outputs: dimension of output, equals the number of + possible actions. + loss_function: if not specified, use squared loss by default. + use_placeholder_for_input: if true, inputs have to be replaced + later with actual input_variable. + + Returns: a Python dictionary with string-valued keys including + 'inputs', 'outputs', 'loss' and 'f'. + """ + # input/output + inputs = C.ops.placeholder(shape=shape_of_inputs) \ + if use_placeholder_for_input \ + else C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + outputs = C.ops.input_variable( + shape=(number_of_outputs,), dtype=np.float32) + + # network structure + centered_inputs = inputs - 128 + scaled_inputs = centered_inputs / 256 + + with C.layers.default_options(activation=C.ops.relu): + q = C.layers.Sequential([ + C.layers.Convolution((8, 8), 32, strides=4), + C.layers.Convolution((4, 4), 64, strides=2), + C.layers.Convolution((3, 3), 64, strides=2), + C.layers.Dense((512,)), + C.layers.Dense(number_of_outputs, activation=None) + ])(scaled_inputs) + + if loss_function is None: + loss = C.losses.squared_error(q, outputs) + else: + loss = loss_function(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/discretize.py b/bindings/python/cntk/contrib/deeprl/agent/shared/discretize.py new file mode 100644 index 000000000000..3bfe575290fe --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/discretize.py @@ -0,0 +1,52 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Discretize continuous environment space.""" + +import numpy as np + + +class BoxSpaceDiscretizer: + """Discretize Box space.""" + + def __init__(self, space, resolution): + spaceclassname = \ + space.__class__.__module__ + '.' + space.__class__.__name__ + if spaceclassname != 'gym.spaces.box.Box': + raise ValueError( + 'Space {0} incompatible with {1}. 
(Only supports ' + 'Box space)'.format(space, self)) + + assert np.isscalar(resolution) or space.low.shape == resolution.shape + + self._state_mins = space.low + self._state_maxs = space.high + if np.isscalar(resolution): + self._state_resolutions = resolution + np.zeros(space.low.shape) + else: + self._state_resolutions = resolution + self.num_states = int(np.prod(self._state_resolutions)) + + def discretize(self, value): + """Discretize box space observation.""" + index = 0 + for i, v in np.ndenumerate(value): + i_idx = self._get_index( + v, + self._state_mins[i], + self._state_maxs[i], + self._state_resolutions[i]) + index = index * self._state_resolutions[i] + i_idx + return int(index) + + def _get_index(self, value, minv, maxv, res): + """Convert a continuous value to a discrete number.""" + if value >= maxv: + return res - 1 + elif value <= minv: + return 0 + else: + ind = np.floor((value - minv) * res / (maxv - minv)) + return int(min(res - 1, max(0, ind))) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/models.py b/bindings/python/cntk/contrib/deeprl/agent/shared/models.py new file mode 100644 index 000000000000..52fc0a22c32b --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/models.py @@ -0,0 +1,156 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""A set of predefined models used by Q learning or Actor-Critic.""" + +import cntk as C +import numpy as np + +import ast + + +class Models: + """A set of predefined models to approximate Q or log of pi (policy). + + The loss function needs to be 'cross_entropy_with_softmax' for policy + gradient methods. + """ + + @staticmethod + def feedforward_network(shape_of_inputs, + number_of_outputs, + model_hidden_layers, + loss_function=None, + use_placeholder_for_input=False): + """Feedforward network to approximate Q or log of pi. + + Args: + shape_of_inputs: tuple of array (input) dimensions. + number_of_outputs: dimension of output, equals the number of + possible actions. + model_hidden_layers: string representing a list of integers + corresponding to number of nodes in each hidden layer. + loss_function: if not specified, use squared loss by default. + use_placeholder_for_input: if true, inputs have to be replaced + later with actual input_variable. + + Returns: a Python dictionary with string valued keys including + 'inputs', 'outputs', 'loss' and 'f'. + """ + # input/output + inputs = C.ops.placeholder(shape=shape_of_inputs) \ + if use_placeholder_for_input \ + else C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + outputs = C.ops.input_variable(shape=(number_of_outputs,), dtype=np.float32) + + # network structure + hidden_layers = ast.literal_eval(model_hidden_layers) + f = C.layers.Sequential([ + C.layers.For(range(len(hidden_layers)), + lambda h: C.layers.Dense(hidden_layers[h], activation=C.ops.relu)), + C.layers.Dense(number_of_outputs, activation=None) + ])(inputs) + + if loss_function is None: + loss = C.losses.squared_error(f, outputs) + else: + loss = loss_function(f, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': f, + 'loss': loss + } + + @staticmethod + def dueling_network(shape_of_inputs, + number_of_outputs, + model_hidden_layers, + loss_function=None, + use_placeholder_for_input=False): + """Dueling network to approximate Q function. 
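+
+        Computes Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), matching the
+        q = v + a - avg_a expression in the network definition below.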
+ + See paper at https://arxiv.org/pdf/1511.06581.pdf. + + Args: + shape_of_inputs: tuple of array (input) dimensions. + number_of_outputs: dimension of output, equals the number of + possible actions. + model_hidden_layers: in the form of "[comma-separated integers, + [comma-separated integers], [comma-separated integers]]". Each + integer is the number of nodes in a hidden layer.The + first set of integers represent the shared component in dueling + network. The second set correponds to the state value function + V and the third set correponds to the advantage function A. + loss_function: if not specified, use squared loss by default. + use_placeholder_for_input: if true, inputs have to be replaced + later with actual input_variable. + + Returns: a Python dictionary with string-valued keys including + 'inputs', 'outputs', 'loss' and 'f'. + """ + # input/output + inputs = C.ops.placeholder(shape=shape_of_inputs) \ + if use_placeholder_for_input \ + else C.ops.input_variable(shape=shape_of_inputs, dtype=np.float32) + outputs = C.ops.input_variable( + shape=(number_of_outputs,), dtype=np.float32) + + # network structure + shared_hidden_layers, v_hidden_layers, a_hidden_layers =\ + Models._parse_dueling_network_structure(model_hidden_layers) + # shared layers + s = C.layers.For( + range(len(shared_hidden_layers)), + lambda h: C.layers.Dense(shared_hidden_layers[h], activation=C.ops.relu))(inputs) + # Value function + v = C.layers.Sequential([ + C.layers.For( + range(len(v_hidden_layers)), + lambda h: C.layers.Dense(v_hidden_layers[h], activation=C.ops.relu)), + C.layers.Dense(1, activation=None) + ])(s) + # Advantage function + a = C.layers.Sequential([ + C.layers.For( + range(len(a_hidden_layers)), + lambda h: C.layers.Dense(a_hidden_layers[h], activation=C.ops.relu)), + C.layers.Dense(number_of_outputs, activation=None) + ])(s) + # Q = V + A - avg(A) + avg_a = C.layers.AveragePooling((number_of_outputs,))(a) + q = v + a - avg_a + + if loss_function is None: + loss = C.losses.squared_error(q, outputs) + else: + loss = loss_function(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } + + @staticmethod + def _parse_dueling_network_structure(hidden_layers_str): + hidden_layers = ast.literal_eval(hidden_layers_str) + + if not ( + len(hidden_layers) > 2 + and isinstance(hidden_layers[-1], list) + and isinstance(hidden_layers[-2], list)): + raise ValueError('Invalid dueling network structure.') + + return\ + Models._remove_none_elements_from_list(hidden_layers[:-2]),\ + Models._remove_none_elements_from_list(hidden_layers[-2]),\ + Models._remove_none_elements_from_list(hidden_layers[-1]) + + @staticmethod + def _remove_none_elements_from_list(value_list): + return [e for e in value_list if e is not None] diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/policy_gradient_parameters.py b/bindings/python/cntk/contrib/deeprl/agent/shared/policy_gradient_parameters.py new file mode 100644 index 000000000000..4338e74e242c --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/policy_gradient_parameters.py @@ -0,0 +1,113 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== +"""Policy Gradient parameters.""" + +import configparser + + +class PolicyGradientParameters: + """Parameters used by Policy Gradient algorithms.""" + + def __init__(self, config_file): + """Read parameter values from config_file. + + Use default value if the parameter is not present. + """ + self.config = configparser.ConfigParser() + self.config.optionxform = str + self.config.read(config_file) + + # Discount factor. + self.gamma = self.config.getfloat( + 'General', 'Gamma', fallback=0.95) + + # Name of class that does preprocessing. + self.preprocessing = self.config.get( + 'General', 'PreProcessing', fallback='') + + # Arguments (except the first argument input_shape) of preprocessing as + # a tuple. + self.preprocessing_args = self.config.get( + 'General', 'PreProcessingArgs', fallback='()') + + # If true, policy pi and value function V share all non-output layers. + # PolicyRepresentation (and/or PolicyNetworkHiddenLayers) define + # structure for all non-output layers. Policy then has one softmax + # output layer, and value function has one linear output layer. If + # false, all non-output layers of policy are still specified by + # PolicyRepresentation. This is equivalent to defining unnormalized log + # of policy pi. The value function, however, is completely specified by + # ValueFunctionRepresentation (and/or ValueNetworkHiddenLayers), which + # outputs a scalar. + self.shared_representation = self.config.getboolean( + 'PolicyGradient', 'SharedRepresentation', fallback=False) + + # Representation of policy. + self.policy_representation = self.config.get( + 'PolicyGradient', 'PolicyRepresentation', fallback='nn') + + # Suppose gradient of policy network is g, gradient of value network + # is gv, during each update, policy network is updated as + # \theta <- \theta + \eta * g where \eta is learning rate, and + # value network is updated as + # \theta_v <- \theta_v + \eta * relative_step_size * gv. This allows + # policy network and value network to be updated at different learning + # rates. Alternatively, this can be viewed as relative weight between + # policy loss and value function loss. + self.relative_step_size = self.config.getfloat( + 'PolicyGradient', 'RelativeStepSize', fallback=0.5) + + # Weight of regularization term. + self.regularization_weight = self.config.getfloat( + 'PolicyGradient', 'RegularizationWeight', fallback=0.001) + + # Number of nodes in each hidden layer of policy network. + self.policy_network_hidden_layers = self.config.get( + 'NetworkModel', 'PolicyNetworkHiddenLayerNodes', fallback='[10]') + + # Representation of value function. + self.value_function_representation = self.config.get( + 'PolicyGradient', 'ValueFunctionRepresentation', fallback='nn') + + # Number of nodes in each hidden layer of value network. + self.value_network_hidden_layers = self.config.get( + 'NetworkModel', 'ValueNetworkHiddenLayerNodes', fallback='[10]') + + # Initial value of eta, which is the learning rate for gradient descent. + self.initial_eta = self.config.getfloat( + 'Optimization', 'InitialEta', fallback=0.001) + + # Number of steps before eta reaches minimum value. + self.eta_decay_step_count = self.config.getint( + 'Optimization', 'EtaDecayStepCount', fallback=100000) + + # Minimum value of eta. 
Since Adam is used as the optimizer, a good + # starting point is to set EtaMinimum equal to InitialEta, which is + # equivalent to using a constant global learning rate cap, while Adam + # continuously adapts individual parameter learning rates. + self.eta_minimum = self.config.getfloat( + 'Optimization', 'EtaMinimum', fallback=0.001) + + # Momentum used by Adam. + self.momentum = self.config.getfloat( + 'Optimization', 'Momentum', fallback=0.95) + + # Update frequency for policy network and value network, in the number + # of time steps. + self.update_frequency = self.config.getint( + 'PolicyGradient', 'UpdateFrequency', fallback=64) + + # Name of a file containing model of the same structure as policy + # network (unnormalized log of policy pi), where model is obtained + # through other methods (e.g. supervised learning), and saved by + # cntk.ops.functions.Function.save(). Random initialization is + # performed if value is empty. + self.initial_policy_network = self.config.get( + 'PolicyGradient', 'InitialPolicy', fallback='') + + def save(self, config_file): + with open(config_file, 'w') as c: + self.config.write(c) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/preprocessing.py b/bindings/python/cntk/contrib/deeprl/agent/shared/preprocessing.py new file mode 100644 index 000000000000..6db68521e81c --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/preprocessing.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Base class for defining preprocessing, as well as two concrete examples.""" + +from abc import ABCMeta, abstractmethod +from collections import deque + +import numpy as np +from PIL import Image + + +class Preprocessing(object): + """Base class for defining preprocessing. + + All subclass constructors will take input_shape as the first argument. + """ + + __metaclass__ = ABCMeta + + def __init__(self, input_shape): + """Constructor for base Preprocessing class.""" + self._input_shape = input_shape + + @abstractmethod + def output_shape(self): + """Return shape of preprocessed observation.""" + pass + + @abstractmethod + def reset(self): + """Reset preprocessing pipeline for new episode.""" + pass + + @abstractmethod + def preprocess(self, observation): + """Return preprocessed observation.""" + pass + + +class AtariPreprocessing(Preprocessing): + """Preprocess screen images from Atari 2600 games. + + The image is represented by an array of shape (210, 160, 3). See + https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf + for more details. 
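+
+    Each frame is max-pooled with the previous raw frame, reduced to its
+    luminance channel, resized to 84 x 84 and stacked with the previous
+    history_len - 1 processed frames.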
+ """ + + def __init__(self, input_shape, history_len=4): + super(AtariPreprocessing, self).__init__(input_shape) + self.__history_len = history_len + self.__processed_image_seq = deque(maxlen=history_len) + self.reset() + + def output_shape(self): + """Return shape of preprocessed Atari images.""" + return (self.__history_len, 84, 84) + + def reset(self): + """Reset preprocessing pipeline for new episode.""" + self.__previous_raw_image = np.zeros(self._input_shape, dtype=np.uint8) + self.__processed_image_seq.clear() + for i in range(self.__history_len): + self.__processed_image_seq.append(np.zeros((84, 84))) + + def preprocess(self, image): + """Return preprocessed screen images from Atari 2600 games.""" + if image.shape != self._input_shape: + raise ValueError( + 'Expecting image in shape {0} but get {1}\n'.format( + self._input_shape, image.shape)) + + # Take the maximum value for each pixel over the current frame and the + # previous one. + im = Image.fromarray( + np.maximum(image, self.__previous_raw_image), mode='RGB') + + # Extract luminance band. + im = im.convert('YCbCr').split()[0] + + # Scale to 84 x 84 + im = im.resize((84, 84), Image.BILINEAR) + + self.__processed_image_seq.append(np.array(im)) + self.__previous_raw_image = image + + return np.stack(list(self.__processed_image_seq)) + + +class SlidingWindow(Preprocessing): + """Stack windowed inputs (x(t-m+1), ... x(t)).""" + + def __init__(self, input_shape, history_len=4, dtype=np.float32): + super(SlidingWindow, self).__init__(input_shape) + self.__dtype = dtype + self.__history_len = history_len + self.__history = deque(maxlen=history_len) + self.reset() + + def output_shape(self): + """Return shape of preprocessed input.""" + return (self.__history_len,) + self._input_shape + + def reset(self): + """Reset preprocessing pipeline for new episode.""" + self.__history.clear() + for i in range(self.__history_len): + self.__history.append(np.zeros(self._input_shape, self.__dtype)) + + def preprocess(self, x): + """Return preprocessed input x.""" + if x.shape != self._input_shape: + raise ValueError( + 'Expecting input in shape {0} but get {1}\n'.format( + self._input_shape, x.shape)) + + if x.dtype != self.__dtype: + raise ValueError( + 'Expecting input in dtype {0} but get {1}\n'.format( + self.__dtype, x.dtype)) + + self.__history.append(x) + return np.stack(list(self.__history)) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/qlearning_parameters.py b/bindings/python/cntk/contrib/deeprl/agent/shared/qlearning_parameters.py new file mode 100644 index 000000000000..bf4bcb0efb9d --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/qlearning_parameters.py @@ -0,0 +1,155 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Q learning parameters.""" + +import numpy as np + +import ast +import configparser + + +class QLearningParameters: + """Parameters used by Q learning algorithm.""" + + def __init__(self, config_file): + """Read parameter values from config_file. + + Use default value if the value is not present. + """ + # TODO: validate parameter values. + self.config = configparser.ConfigParser() + self.config.optionxform = str + self.config.read(config_file) + + # Discount factor + self.gamma = self.config.getfloat( + 'General', 'Gamma', fallback=0.95) + + # Name of class that does preprocessing. 
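+        # As a hypothetical example, the Atari pipeline shipped with this
+        # toolkit would be referenced as
+        # 'cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing'.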
+ self.preprocessing = self.config.get( + 'General', 'PreProcessing', fallback='') + + # Arguments (except the first argument input_shape) of preprocessing as + # a tuple. + self.preprocessing_args = self.config.get( + 'General', 'PreProcessingArgs', fallback='()') + + # Representation of Q function, taking value from {'tabular', 'nn'}. + self.q_representation = self.config.get( + 'QLearningAlgo', 'QRepresentation', fallback='tabular') + + # Initial value of epsilon (exploration rate), used by epsilon-greedy + # policy. + self.initial_epsilon = self.config.getfloat( + 'QLearningAlgo', 'InitialEpsilon', fallback=0.1) + + # Number of steps before epsilon reaches minimum value. + self.epsilon_decay_step_count = self.config.getint( + 'QLearningAlgo', 'EpsilonDecayStepCount', fallback=100000) + + # Minimum value of epsilon. + self.epsilon_minimum = self.config.getfloat( + 'QLearningAlgo', 'EpsilonMinimum', fallback=0.01) + + # Initial value of eta, which is the learning rate for gradient + # descent. + self.initial_eta = self.config.getfloat( + 'Optimization', 'InitialEta', fallback=0.001) + + # Number of steps before eta reaches minimum value. + self.eta_decay_step_count = self.config.getint( + 'Optimization', 'EtaDecayStepCount', fallback=100000) + + # Minimum value of eta. Since Adam is used as the optimizer, a good + # starting point is to set EtaMinimum equal to InitialEta, which is + # equivalent to using a constant learning rate. + self.eta_minimum = self.config.getfloat( + 'Optimization', 'EtaMinimum', fallback=0.001) + + # Momentum used by RMSProp. + self.momentum = self.config.getfloat( + 'Optimization', 'Momentum', fallback=0.95) + + # Initial value for table entries. + # TODO(maoyi): allow DQN initialization through config file. + self.initial_q = self.config.getfloat( + 'QLearningAlgo', 'InitialQ', fallback=0.0) + + # Number of partitions for discretizing the continuous space. Either a + # scalar which is applied to all dimensions, or a list specifying + # different value for different dimension. + self.discretization_resolution = ast.literal_eval(self.config.get( + 'QLearningAlgo', 'DiscretizationResolution', fallback='10')) + if isinstance(self.discretization_resolution, list): + self.discretization_resolution = np.array( + self.discretization_resolution) + + # Number of actions chosen between successive + # target network updates. + self.target_q_update_frequency = self.config.getint( + 'QLearningAlgo', 'TargetQUpdateFrequency', fallback=10000) + + # Sample size of each minibatch. + self.minibatch_size = self.config.getint( + 'QLearningAlgo', 'MinibatchSize', fallback=32) + + # Number of replays per update. + self.replays_per_update = self.config.getint( + 'QLearningAlgo', 'ReplaysPerUpdate', fallback=1) + + # Number of actions chosen between successive SGD updates of Q. + self.q_update_frequency = self.config.getint( + 'QLearningAlgo', 'QUpdateFrequency', fallback=4) + + # Use Huber loss with \delta=1 when True. Otherwise, use least square + # loss. + self.use_error_clipping = self.config.getboolean( + 'QLearningAlgo', 'ErrorClipping', fallback=True) + + # Capacity of replay memory. + self.replay_memory_capacity = self.config.getint( + 'ExperienceReplay', 'Capacity', fallback=100000) + + # A uniform random policy is run for this number of steps to populate + # replay memory. + self.replay_start_size = self.config.getint( + 'ExperienceReplay', 'StartSize', fallback=5000) + + # Use prioritized replay. Fall back to uniform sampling when False . 
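+        # The PriorityAlpha/PriorityBeta/PriorityEpsilon settings below
+        # follow the notation of https://arxiv.org/pdf/1511.05952.pdf.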
+ self.use_prioritized_replay = self.config.getboolean( + 'ExperienceReplay', 'Prioritized', fallback=False) + + # Used by prioritized replay, to determine how much prioritization is + # used, with 0 corresponding to uniform. + self.priority_alpha = self.config.getfloat( + 'ExperienceReplay', 'PriorityAlpha', fallback=0.7) + + # Used by prioritized replay, to anneal the amount of importance + # sampling correction. + self.priority_beta = self.config.getfloat( + 'ExperienceReplay', 'PriorityBeta', fallback=0.5) + + # Used by prioritized replay, to prevent transitions not being visited + # once their error is zero. + self.priority_epsilon = self.config.getfloat( + 'ExperienceReplay', 'PriorityEpsilon', fallback=0.01) + + # Number of nodes in each hidden layer, starting after the input layer. + self.hidden_layers = self.config.get( + 'NetworkModel', 'HiddenLayerNodes', fallback='[20]') + + # Maximum norm of gradient per sample. No gradient clipping if the + # parameter is missing from the config file. + self.gradient_clipping_threshold = self.config.getfloat( + 'Optimization', 'GradientClippingThreshold', fallback=np.inf) + + # Use Double Q-learning if true. + self.double_q_learning = self.config.getboolean( + 'QLearningAlgo', 'DoubleQLearning', fallback=False) + + def save(self, config_file): + with open(config_file, 'w') as c: + self.config.write(c) diff --git a/bindings/python/cntk/contrib/deeprl/agent/shared/replay_memory.py b/bindings/python/cntk/contrib/deeprl/agent/shared/replay_memory.py new file mode 100644 index 000000000000..8068bfcd1466 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/shared/replay_memory.py @@ -0,0 +1,163 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Replay memory for Q learning.""" + +from __future__ import division + +import math +import random +from collections import namedtuple + +# Transition for experience replay. +# +# Args: +# state: current state. +# action: action applied to current state. +# reward: scalar representing reward received by applying action to +# current state. +# next_state: the new state after action is applied. +# priority: associated priority. +_Transition = namedtuple('Transition', + ['state', 'action', 'reward', 'next_state', + 'priority']) + + +class ReplayMemory: + """Replay memory to store samples of experience. + + Each transition is represented as (state, action, reward, next_state, + priority) tuple. 'priority' is ignored for non-prioritized experience + replay. + """ + + def __init__(self, capacity, prioritized=False): + """Create replay memory with size capacity.""" + self._use_prioritized_replay = prioritized + self._capacity = capacity + # Position in the list where new experience will be written to. + self._position = 0 + # For prioritized replay, 'sum-tree' data structure is used. + # Transitions are stored in leaf nodes, while internal nodes store the + # sum of priorities from all its descendants. List is used to represent + # this complete binary tree. The following code initializes + # all internal nodes, if any, to have value 0. + self._memory = [0] * (capacity - 1) if prioritized else [] + + def store(self, *args): + """Store a transition in replay memory. + + If the memory is full, the oldest one gets overwritten. 
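+
+        Args are forwarded positionally to the _Transition namedtuple:
+        (state, action, reward, next_state, priority).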
+ """ + if not self._isfull(): + self._memory.append(None) + position = self._next_position_then_increment() + old_priority = 0 if self._memory[position] is None \ + else self._memory[position].priority + transition = _Transition(*args) + self._memory[position] = transition + if self._use_prioritized_replay: + self._update_internal_nodes( + position, transition.priority - old_priority) + + def update_priority(self, map_from_position_to_priority): + """Update priority of transitions. + + Args: + map_from_position_to_priority: dictionary mapping position of + transition to its new priority. position should come from + tuples returned by sample_minibatch(). + """ + if not self._use_prioritized_replay: + return + for position, new_priority in map_from_position_to_priority.items(): + old_priority = self._memory[position].priority + self._memory[position] = _Transition( + self._memory[position].state, + self._memory[position].action, + self._memory[position].reward, + self._memory[position].next_state, + new_priority) + self._update_internal_nodes( + position, new_priority - old_priority) + + def _actual_capacity(self): + """Actual capacity needed. + + For prioritized replay, this includes both leaf nodes containing + transitions and internal nodes containing priority sum. + """ + return 2 * self._capacity - 1 \ + if self._use_prioritized_replay \ + else self._capacity + + def _isfull(self): + return len(self._memory) == self._actual_capacity() + + def _next_position_then_increment(self): + """Similar to position++.""" + start = self._capacity - 1 \ + if self._use_prioritized_replay \ + else 0 + position = start + self._position + self._position = (self._position + 1) % self._capacity + return position + + def _update_internal_nodes(self, index, delta): + """Update internal priority sums when leaf priority has been changed. + + Args: + index: leaf node index + delta: change in priority + """ + while index > 0: + index = (index - 1) // 2 + self._memory[index] += delta + + def size(self): + """Return the current number of transitions.""" + l = len(self._memory) + if self._use_prioritized_replay: + l -= (self._capacity - 1) + return l + + def sample_minibatch(self, batch_size): + """Sample minibatch of size batch_size.""" + pool_size = self.size() + if pool_size == 0: + return [] + + if not self._use_prioritized_replay: + chosen_idx = range(pool_size) \ + if pool_size <= batch_size \ + else random.sample(range(pool_size), batch_size) + else: + delta_p = self._memory[0] / batch_size + chosen_idx = [] + for i in range(batch_size): + lower = max(i * delta_p, 0) + upper = min((i + 1) * delta_p, self._memory[0]) + p = random.uniform(lower, upper) + chosen_idx.append(self._sample_with_priority(p)) + + return [(i, self._memory[i]) for i in chosen_idx] + + def _sample_with_priority(self, p): + parent = 0 + while True: + left = 2 * parent + 1 + if left >= len(self._memory): + # parent points to a leaf node already. 
+ return parent + + left_p = self._memory[left] if left < self._capacity - 1 \ + else self._memory[left].priority + if p <= left_p: + parent = left + else: + if left + 1 >= len(self._memory): + raise RuntimeError('Right child is expected to exist.') + p -= left_p + parent = left + 1 diff --git a/bindings/python/cntk/contrib/deeprl/agent/tabular_qlearning.py b/bindings/python/cntk/contrib/deeprl/agent/tabular_qlearning.py new file mode 100644 index 000000000000..a0790ee5cf04 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/agent/tabular_qlearning.py @@ -0,0 +1,121 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== +"""Tabular Q-learning.""" + +import copy + +import numpy as np + +from .agent import AgentBaseClass +from .shared.qlearning_parameters import QLearningParameters + + +class TabularQLearning(AgentBaseClass): + """Q-learning agent with tabular representation.""" + + def __init__(self, cfg_filename, o_space, a_space): + """Constructor for Q learning algorithm with tabular representation.""" + super(TabularQLearning, self).__init__(o_space, a_space) + + self._parameters = QLearningParameters(cfg_filename) + if self._parameters.q_representation != 'tabular': + raise ValueError( + 'Unexpected representation for tabular Q-learning: "{0}"' + '\n'.format(self._parameters.q_representation)) + + # Discretize the observation space if necessary + if self._classname(o_space) != 'gym.spaces.discrete.Discrete': + self._discretize_observation_space( + o_space, self._parameters.discretization_resolution) + + self._q = self._parameters.initial_q + \ + np.zeros((self._num_states, self._num_actions)) + print('Initialized discrete Q-learning agent with {0} states and ' + '{1} actions.'.format(self._num_states, self._num_actions)) + + self.episode_count = 0 + # step_count is incremented each time after receiving reward. 
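+ # i.e. once per step() and once per end() call, but not in start().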
+ self.step_count = 0 + + def start(self, state): + """Start a new episode.""" + self._adjust_exploration_rate() + self._last_state = self._preprocess_state(state) + self._last_action, action_behavior = \ + self._choose_action(self._last_state) + self.episode_count += 1 + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def step(self, reward, next_state): + """Observe one transition and choose an action.""" + self._adjust_learning_rate() + self.step_count += 1 + + next_encoded_state = self._preprocess_state(next_state) + td_err = reward + self._parameters.gamma * \ + np.max(self._q[next_encoded_state]) - \ + self._q[self._last_state, self._last_action] + self._q[self._last_state, self._last_action] += self._eta * td_err + + self._adjust_exploration_rate() + self._last_state = next_encoded_state + self._last_action, action_behavior = self._choose_action( + self._last_state) + return self._last_action, { + 'action_behavior': action_behavior, + 'epsilon': self._epsilon} + + def end(self, reward, next_state): + """Last observed reward/state of the episode (which then terminates).""" + self._adjust_learning_rate() + self.step_count += 1 + + td_err = reward - self._q[self._last_state, self._last_action] + self._q[self._last_state, self._last_action] += self._eta * td_err + + def set_as_best_model(self): + """Copy current model to best model.""" + self._best_model = copy.deepcopy(self._q) + + def save(self, filename): + """Save best model to file.""" + with open(filename, 'w') as f: + for s in range(self._num_states): + f.write('{0}\t{1}\n'.format(s, str(self._best_model[s]))) + + def save_parameter_settings(self, filename): + """Save parameter settings to file.""" + self._parameters.save(filename) + + def enter_evaluation(self): + """Setup before evaluation.""" + self._epsilon = 0 + + def _adjust_learning_rate(self): + self._eta = self._parameters.eta_minimum + max( + 0, + (self._parameters.initial_eta - self._parameters.eta_minimum) * + (1 - float(self.step_count)/self._parameters.eta_decay_step_count)) + + def _adjust_exploration_rate(self): + self._epsilon = self._parameters.epsilon_minimum + max( + 0, + (self._parameters.initial_epsilon - self._parameters.epsilon_minimum) * + (1 - float(self.step_count)/self._parameters.epsilon_decay_step_count)) + + def _choose_action(self, state): + """Epsilon greedy policy.""" + if np.random.uniform(0, 1) < self._epsilon: + return np.random.randint(self._num_actions), 'RANDOM' + else: + return np.argmax(self._q[state]), 'GREEDY' + + def _preprocess_state(self, state): + """Discretize state to table row index.""" + o = self._discretize_state_if_necessary(state) + return o diff --git a/bindings/python/cntk/contrib/deeprl/tests/agent_test.py b/bindings/python/cntk/contrib/deeprl/tests/agent_test.py new file mode 100644 index 000000000000..a44c200fbe0e --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/agent_test.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest + +import cntk.contrib.deeprl.tests.spaces as spaces +from cntk.contrib.deeprl.agent.agent import AgentBaseClass + + +class AgentBaseClassTest(unittest.TestCase): + """Unit tests for AgentBaseClass.""" + + def test_init_unsupported_action_space(self): + action_space = spaces.Box(0, 1, (1,)) + observation_space = spaces.Discrete(3) + self.assertRaises( + ValueError, AgentBaseClass, observation_space, action_space) + + def test_init_unsupported_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Tuple( + [spaces.Discrete(3), spaces.Discrete(3)]) + self.assertRaises( + ValueError, AgentBaseClass, observation_space, action_space) + + def test_init_discrete_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Discrete(3) + sut = AgentBaseClass(observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertEqual(sut._num_states, 3) + self.assertEqual(sut._shape_of_inputs, (3, )) + self.assertTrue(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + def test_init_multibinary_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.MultiBinary(3) + sut = AgentBaseClass(observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (3, )) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + def test_init_box_observation_space(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = AgentBaseClass(observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1, )) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) diff --git a/bindings/python/cntk/contrib/deeprl/tests/cntk_utils_test.py b/bindings/python/cntk/contrib/deeprl/tests/cntk_utils_test.py new file mode 100644 index 000000000000..f5976f0e5d04 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/cntk_utils_test.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest + +import numpy as np + +from cntk.contrib.deeprl.agent.shared.cntk_utils import (huber_loss, + negative_of_entropy_with_softmax) +from cntk.ops import input_variable + + +class CNTKUtilsTest(unittest.TestCase): + """Unit tests for cntk_utils.""" + + def test_huber_loss(self): + i1 = input_variable((2)) + i2 = input_variable((2)) + + np.testing.assert_array_equal( + huber_loss(i1, i2).eval({ + i1: [[2, 1], [1, 5]], + i2: [[4, 1], [1, 4]] + }), + [1.5, 0.5] + ) + + def test_entropy(self): + i = input_variable((2)) + + np.testing.assert_almost_equal( + negative_of_entropy_with_softmax(i).eval({ + i: [[0.5, 0.5], [1000, 1]] + }), + [-0.693147181, 0] + ) diff --git a/bindings/python/cntk/contrib/deeprl/tests/data/initial_policy_network.dnn b/bindings/python/cntk/contrib/deeprl/tests/data/initial_policy_network.dnn new file mode 100644 index 000000000000..5c9b1fb2c814 Binary files /dev/null and b/bindings/python/cntk/contrib/deeprl/tests/data/initial_policy_network.dnn differ diff --git a/bindings/python/cntk/contrib/deeprl/tests/discretize_test.py b/bindings/python/cntk/contrib/deeprl/tests/discretize_test.py new file mode 100644 index 000000000000..7222b87802cc --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/discretize_test.py @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.shared.discretize import BoxSpaceDiscretizer + + +class BoxSpaceDiscretizerTest(unittest.TestCase): + """Unit tests for BoxSpaceDiscretizer.""" + + def test_scalar(self): + s = spaces.Box(0, 1, (2,)) + sut = BoxSpaceDiscretizer(s, 10) + + self.assertEqual(sut.discretize([0, 0]), 0) + self.assertEqual(sut.discretize([0.05, 0]), 0) + self.assertEqual(sut.discretize([0.95, 0]), 90) + self.assertEqual(sut.discretize([0, 0.05]), 0) + self.assertEqual(sut.discretize([0, 0.95]), 9) + self.assertEqual(sut.discretize([0.1, 0.2]), 12) + self.assertEqual(sut.discretize([1, 1]), 99) + + def test_list(self): + s = spaces.Box(0, 1, (2,)) + sut = BoxSpaceDiscretizer(s, np.array([10, 2])) + + self.assertEqual(sut.discretize([0, 0]), 0) + self.assertEqual(sut.discretize([0.05, 0]), 0) + self.assertEqual(sut.discretize([0.95, 0]), 18) + self.assertEqual(sut.discretize([0, 0.05]), 0) + self.assertEqual(sut.discretize([0, 0.95]), 1) + self.assertEqual(sut.discretize([0.1, 0.2]), 2) + self.assertEqual(sut.discretize([1, 1]), 19) + + sut = BoxSpaceDiscretizer(s, np.array([10, 1])) + + self.assertEqual(sut.discretize([0, 0]), 0) + self.assertEqual(sut.discretize([0.05, 0]), 0) + self.assertEqual(sut.discretize([0.95, 0]), 9) + self.assertEqual(sut.discretize([0, 0.05]), 0) + self.assertEqual(sut.discretize([0, 0.95]), 0) + self.assertEqual(sut.discretize([0.1, 0.2]), 1) + self.assertEqual(sut.discretize([1, 1]), 9) + + def test_array(self): + s = spaces.Box(0, 1, (2, 2)) + sut = BoxSpaceDiscretizer(s, np.array([[2, 2], [2, 2]])) + + self.assertEqual(sut.discretize([[0, 0], [0, 0]]), 0) + self.assertEqual(sut.discretize([[0.05, 0], [0, 0]]), 0) + self.assertEqual(sut.discretize([[0.95, 0], [0, 0]]), 8) + self.assertEqual(sut.discretize([[0, 0.05], [0, 0]]), 0) + self.assertEqual(sut.discretize([[0, 0.95], 
[0, 0]]), 4) + self.assertEqual(sut.discretize([[0, 0], [0.05, 0]]), 0) + self.assertEqual(sut.discretize([[0, 0], [0.95, 0]]), 2) + self.assertEqual(sut.discretize([[0, 0], [0, 0.05]]), 0) + self.assertEqual(sut.discretize([[0, 0], [0, 0.95]]), 1) + self.assertEqual(sut.discretize([[0.1, 0.6], [0.5, 0.2]]), 6) + self.assertEqual(sut.discretize([[1, 1], [1, 1]]), 15) diff --git a/bindings/python/cntk/contrib/deeprl/tests/models_test.py b/bindings/python/cntk/contrib/deeprl/tests/models_test.py new file mode 100644 index 000000000000..b661b6d32548 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/models_test.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest + +from cntk.contrib.deeprl.agent.shared.models import Models + + +class ModelsTest(unittest.TestCase): + """Unit tests for Models.""" + + def test_parse_dueling_network_structure(self): + a, b, c =\ + Models._parse_dueling_network_structure( + "[1, 2, [3], [4, 5]]") + self.assertEqual(a, [1, 2]) + self.assertIsInstance(a[0], int) + self.assertEqual(b, [3]) + self.assertEqual(c, [4, 5]) + + a, b, c =\ + Models._parse_dueling_network_structure( + "[None, [3], [None]]") + self.assertEqual(a, []) + self.assertEqual(b, [3]) + self.assertEqual(c, []) diff --git a/bindings/python/cntk/contrib/deeprl/tests/policy_gradient_test.py b/bindings/python/cntk/contrib/deeprl/tests/policy_gradient_test.py new file mode 100644 index 000000000000..07facc9d0746 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/policy_gradient_test.py @@ -0,0 +1,421 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest +from unittest.mock import MagicMock, Mock, patch + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.policy_gradient import ActorCritic +from cntk.layers import Dense +from cntk.losses import cross_entropy_with_softmax +from cntk.ops import input_variable, placeholder + + +class PolicyGradientTest(unittest.TestCase): + """Unit tests for policy gradient.""" + + @patch('cntk.contrib.deeprl.agent.policy_gradient.Models.feedforward_network') + def test_init(self, mock_model): + mock_model.side_effect = self._setup_test_model + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + self.assertEqual(mock_model.call_count, 2) + mock_model.assert_has_calls( + [ + unittest.mock.call((1,), 2, '[10]', cross_entropy_with_softmax, + use_placeholder_for_input=True), + unittest.mock.call((1,), 1, '[10]', use_placeholder_for_input=True) + ], + any_order=True) + + @unittest.skip("Skip this as CNTK can't reset UID during test.") + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_from_existing_model(self, mock_parameters): + action_space = spaces.Discrete(3) + observation_space = spaces.Box( + np.array([-1.2, -0.07]), np.array([0.6, 0.07])) + mock_parameters.return_value.policy_representation = 'nn' + mock_parameters.return_value.policy_network_hidden_layers = '[2]' + mock_parameters.return_value.initial_policy_network = \ + 'tests/data/initial_policy_network.dnn' + mock_parameters.return_value.preprocessing = '' + + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 3) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (2,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + # Incompatible network structure. + mock_parameters.return_value.policy_network_hidden_layers = '[]' + self.assertRaises( + Exception, ActorCritic, '', observation_space, action_space) + + # Incompatible action space. + mock_parameters.return_value.policy_network_hidden_layers = '[2]' + action_space = spaces.Discrete(2) + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + # Incompatible observation space. 
+ action_space = spaces.Discrete(3) + observation_space = spaces.Box( + np.array([-1.2, -0.07, -1.0]), np.array([0.6, 0.07, 1.0])) + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.Models.feedforward_network') + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_preprocess(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.SlidingWindow' + mock_parameters.return_value.preprocessing_args = '(2, )' + mock_model.side_effect = self._setup_test_model + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + self.assertIsNotNone(sut._preprocessor) + self.assertEqual(sut._preprocessor.output_shape(), (2, 1)) + self.assertEqual(mock_model.call_count, 2) + mock_model.assert_has_calls( + [ + unittest.mock.call((2, 1), 2, '[2]', cross_entropy_with_softmax, + use_placeholder_for_input=True), + unittest.mock.call((2, 1), 1, '[2]', use_placeholder_for_input=True) + ], + any_order=True) + + @patch('cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn') + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_customized_model(self, mock_parameters, mock_model): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.policy_representation = \ + 'cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn' + mock_parameters.return_value.value_function_representation = \ + 'cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn' + mock_model.side_effect = self._setup_test_model + + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(mock_model.call_count, 2) + mock_model.assert_has_calls( + [ + unittest.mock.call((1,), 2, cross_entropy_with_softmax, + use_placeholder_for_input=True), + unittest.mock.call((1,), 1, use_placeholder_for_input=True) + ], + any_order=True) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_unsupported_model(self, mock_parameters): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self._setup_parameters(mock_parameters.return_value) + + # Verify sut can be constructed. 
+ sut = ActorCritic('', observation_space, action_space) + + mock_parameters.return_value.policy_representation = 'undefined' + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + mock_parameters.return_value.policy_representation = 'nn' + mock_parameters.return_value.value_function_representation = 'undefined' + self.assertRaises( + ValueError, ActorCritic, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_init_shared_representation(self, mock_parameters): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.shared_representation = True + + sut = ActorCritic('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + self.assertTrue( + set(sut._policy_network.parameters).issubset( + set(sut._value_network.parameters))) + diff = set(sut._value_network.parameters).difference( + set(sut._policy_network.parameters)) + # one for W and one for b + self.assertEqual(len(diff), 2) + + shapes = [] + for item in diff: + shapes.append(item.shape) + self.assertEqual(set(shapes), {(2, 1), (1,)}) + + def test_rollout(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + sut._choose_action = Mock(side_effect=[(0, ''), (1, ''), (1, '')]) + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + sut.step(0.2, np.array([0.3], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + self.assertEqual(sut._trajectory_states, [0.1, 0.2, 0.3]) + + sut.end(0.3, np.array([0.4], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2, 0.3]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + self.assertEqual(sut._trajectory_states, [0.1, 0.2, 0.3]) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_rollout_preprocess(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.SlidingWindow' + mock_parameters.return_value.preprocessing_args = '(2, "float32")' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + sut._choose_action = Mock(side_effect=[(0, ''), (1, ''), (1, '')]) + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + sut.step(0.2, np.array([0.3], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + np.testing.assert_array_equal( + sut._trajectory_states, + [ + np.array([[0], [0.1]], np.float32), + np.array([[0.1], [0.2]], np.float32), + np.array([[0.2], [0.3]], np.float32) + ]) + + sut.end(0.3, np.array([0.4], np.float32)) + + self.assertEqual(sut._trajectory_rewards, [0.1, 0.2, 0.3]) + self.assertEqual(sut._trajectory_actions, [0, 1, 1]) + np.testing.assert_array_equal( + sut._trajectory_states, + [ + np.array([[0], [0.1]], np.float32), + np.array([[0.1], [0.2]], np.float32), 
+ np.array([[0.2], [0.3]], np.float32) + ]) + + @patch('cntk.contrib.deeprl.agent.policy_gradient.PolicyGradientParameters') + def test_rollout_with_update(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.update_frequency = 2 + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + sut._update_networks = MagicMock() + + sut._choose_action = Mock(side_effect=[ + (0, ''), (1, ''), (1, ''), (0, ''), (1, ''), (0, '')]) + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + self.assertEqual(sut._trajectory_rewards, [0.1]) + self.assertEqual(sut._trajectory_actions, [0, 1]) + self.assertEqual(sut._trajectory_states, [0.1, 0.2]) + self.assertEqual(sut._update_networks.call_count, 0) + + sut.step(0.2, np.array([0.3], np.float32)) + self.assertEqual(sut._trajectory_rewards, []) + self.assertEqual(sut._trajectory_actions, [1]) + self.assertEqual(sut._trajectory_states, [0.3]) + self.assertEqual(sut._update_networks.call_count, 1) + + sut.step(0.3, np.array([0.4], np.float32)) + self.assertEqual(sut._trajectory_rewards, [0.3]) + self.assertEqual(sut._trajectory_actions, [1, 0]) + self.assertEqual(sut._trajectory_states, [0.3, 0.4]) + self.assertEqual(sut._update_networks.call_count, 1) + + sut.start(np.array([0.5], np.float32)) + self.assertEqual(sut._trajectory_rewards, []) + self.assertEqual(sut._trajectory_actions, [1]) + self.assertEqual(sut._trajectory_states, [0.5]) + self.assertEqual(sut._update_networks.call_count, 1) + + sut.step(0.4, np.array([0.6], np.float32)) + self.assertEqual(sut._trajectory_rewards, []) + self.assertEqual(sut._trajectory_actions, [0]) + self.assertEqual(sut._trajectory_states, [0.6]) + self.assertEqual(sut._update_networks.call_count, 2) + + sut.end(0.5, np.array([0.7], np.float32)) + self.assertEqual(sut._trajectory_rewards, [0.5]) + self.assertEqual(sut._trajectory_actions, [0]) + self.assertEqual(sut._trajectory_states, [0.6]) + self.assertEqual(sut._update_networks.call_count, 2) + + def test_process_accumulated_trajectory(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + # Set up. + self._setup_trajectory(sut) + + # Call test method. + sut._process_accumulated_trajectory(False) + + # Verify results. + self.assertEqual(len(sut._trajectory_rewards), 0) + self.assertEqual(len(sut._trajectory_actions), 0) + self.assertEqual(len(sut._trajectory_states), 0) + + np.testing.assert_array_equal( + sut._input_buffer, + [np.array([0.1], np.float32), np.array([0.2], np.float32)]) + # For unknown reason, got [2.9974999999999996] instead of [2.9975] for + # the following testcase, therefore use assert_array_almost_equal. + np.testing.assert_array_almost_equal( + sut._value_network_output_buffer, + [ + [2.9975], # 3.05 * 0.95 + 0.1 + [3.05] # 3 (initial_r) * 0.95 + 0.2 + ]) + np.testing.assert_array_equal( + sut._policy_network_output_buffer, + [ + np.array([1, 0], np.float32), + np.array([0, 1], np.float32) + ] + ) + np.testing.assert_array_almost_equal( + sut._policy_network_weight_buffer, + [ + [0.9975], # 2.9975 - 2 + [2.05] # 3.05 - 1 + ]) + + def test_process_accumulated_trajectory_keep_last(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + # Set up. 
+ self._setup_trajectory(sut) + + # Call test method. + sut._process_accumulated_trajectory(True) + + # Verify results. + self.assertEqual(len(sut._trajectory_rewards), 0) + self.assertEqual(len(sut._trajectory_actions), 0) + self.assertEqual(sut._trajectory_states, [np.array([0.3], np.float32)]) + + def test_update_policy_and_value_function(self): + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = ActorCritic('', observation_space, action_space) + + # Set up. + self._setup_trajectory(sut) + sut._process_accumulated_trajectory(True) + sut._trainer = MagicMock() + sut._adjust_learning_rate = MagicMock() + + # Call test method. + sut._update_networks() + + # Verify value network behavior. + self.assertEqual( + sut._trainer.train_minibatch.call_count, 1) + call_args = sut._trainer.train_minibatch.call_args + np.testing.assert_array_equal( + call_args[0][0][sut._input_variables], + [np.array([0.1], np.float32), np.array([0.2], np.float32)]) + np.testing.assert_array_almost_equal( + call_args[0][0][sut._value_network_output_variables], + [[2.9975], [3.05]]) + np.testing.assert_array_equal( + call_args[0][0][sut._policy_network_output_variables], + [np.array([1, 0], np.float32), np.array([0, 1], np.float32)]) + np.testing.assert_array_almost_equal( + call_args[0][0][sut._policy_network_weight_variables], + [[0.9975], [2.05]]) + + # Verify data buffer size. + self.assertEqual(len(sut._input_buffer), 0) + + def _setup_parameters(self, params): + params.policy_representation = 'nn' + params.policy_network_hidden_layers = '[2]' + params.value_function_representation = 'nn' + params.value_network_hidden_layers = '[2]' + params.relative_step_size = 0.5 + params.regularization_weight = 0.001 + params.initial_eta = 0.1 + params.eta_decay_step_count = 10 + params.eta_minimum = 0.01 + params.gamma = 0.9 + params.preprocessing = '' + params.preprocessing_args = '()' + params.shared_representation = False + params.update_frequency = 4 + params.initial_policy_network = '' + params.momentum = 0.95 + + def _setup_trajectory(self, sut): + # Corresponds to the case where sut.end() is not called. + sut._trajectory_rewards = [0.1, 0.2] + sut._trajectory_actions = [0, 1] + sut._trajectory_states = [ + np.array([0.1], np.float32), + np.array([0.2], np.float32), + np.array([0.3], np.float32)] + sut._value_network.eval = MagicMock(side_effect=[ + np.array([[[3]]], np.float32), + np.array([[[2]]], np.float32), + np.array([[[1]]], np.float32)]) + + def _setup_test_model(self, *args, **kwargs): + inputs = placeholder(shape=(1,)) + outputs = input_variable(shape=(1,), dtype=np.float32) + + q = Dense(1, activation=None)(inputs) + loss = cross_entropy_with_softmax(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } diff --git a/bindings/python/cntk/contrib/deeprl/tests/preprocessing_test.py b/bindings/python/cntk/contrib/deeprl/tests/preprocessing_test.py new file mode 100644 index 000000000000..d1ade1d80503 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/preprocessing_test.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest + +import numpy as np + +from cntk.contrib.deeprl.agent.shared.preprocessing import AtariPreprocessing + + +class AtariPreprocessingTest(unittest.TestCase): + """Unit tests for AtariPreprocessing.""" + + def test_atari_preprocessing(self): + p = AtariPreprocessing((210, 160, 3), 4) + self.assertEqual(p._AtariPreprocessing__history_len, 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.zeros((210, 160, 3), dtype='uint8')) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.zeros((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[-1], + np.zeros((84, 84), dtype='uint8')) + + r = p.preprocess(np.ones((210, 160, 3), dtype=np.uint8)) + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.ones((210, 160, 3), dtype=np.uint8)) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.zeros((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[-1], + np.ones((84, 84), dtype='uint8')) + self.assertEqual(r.shape, (4, 84, 84)) + np.testing.assert_array_equal( + np.squeeze(r[3, :, :]), + np.ones((84, 84), dtype='uint8')) + + p.preprocess(np.ones((210, 160, 3), dtype=np.uint8) * 2) + p.preprocess(np.ones((210, 160, 3), dtype=np.uint8) * 3) + r = p.preprocess(np.ones((210, 160, 3), dtype=np.uint8) * 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.ones((210, 160, 3), dtype='uint8') * 4) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.ones((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[1], + np.ones((84, 84), dtype='uint8') * 2) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[2], + np.ones((84, 84), dtype='uint8') * 3) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[3], + np.ones((84, 84), dtype='uint8') * 4) + self.assertEqual(r.shape, (4, 84, 84)) + np.testing.assert_array_equal( + np.squeeze(r[3, :, :]), + np.ones((84, 84), dtype='uint8') * 4) + + p.reset() + np.testing.assert_array_equal( + p._AtariPreprocessing__previous_raw_image, + np.zeros((210, 160, 3), dtype='uint8')) + self.assertEqual(len(p._AtariPreprocessing__processed_image_seq), 4) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[0], + np.zeros((84, 84), dtype='uint8')) + np.testing.assert_array_equal( + p._AtariPreprocessing__processed_image_seq[-1], + np.zeros((84, 84), dtype='uint8')) diff --git a/bindings/python/cntk/contrib/deeprl/tests/qlearning_test.py b/bindings/python/cntk/contrib/deeprl/tests/qlearning_test.py new file mode 100644 index 000000000000..8c84b331057e --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/qlearning_test.py @@ -0,0 +1,491 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. 
+# ============================================================================== + +import unittest +from unittest.mock import MagicMock, Mock, patch + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.qlearning import QLearning +from cntk.contrib.deeprl.agent.shared.cntk_utils import huber_loss +from cntk.contrib.deeprl.agent.shared.replay_memory import _Transition +from cntk.layers import Dense +from cntk.losses import squared_error +from cntk.ops import input_variable + + +class QLearningTest(unittest.TestCase): + """Unit tests for QLearning.""" + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.Models.feedforward_network') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn(self, + mock_parameters, + mock_model, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + self.assertFalse(hasattr(sut, 'weight_variables')) + self.assertIsNotNone(sut._trainer) + mock_model.assert_called_with((1,), 2, '[2]', None) + mock_replay_memory.assert_called_with(100, False) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn_prioritized_replay(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.use_prioritized_replay = True + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertIsNotNone(sut._weight_variables) + mock_replay_memory.assert_called_with(100, True) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn_preprocessing(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.AtariPreprocessing' + mock_parameters.return_value.preprocessing_args = '()' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + # Preprocessor with default arguments. + self.assertIsNotNone(sut._preprocessor) + self.assertEqual(sut._preprocessor.output_shape(), (4, 84, 84)) + + # Preprocessor with arguments passed as a tuple. + mock_parameters.return_value.preprocessing_args = '(3,)' + sut = QLearning('', observation_space, action_space) + self.assertEqual(sut._preprocessor.output_shape(), (3, 84, 84)) + + # Preprocessor with inappropriate arguments. + mock_parameters.return_value.preprocessing_args = '(3, 4)' + self.assertRaises( + TypeError, QLearning, '', observation_space, action_space) + + # Undefined preprocessor. 
+ mock_parameters.return_value.preprocessing = 'undefined' + self.assertRaises( + ValueError, QLearning, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.qlearning.Models.dueling_network') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dueling_dqn(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.q_representation = 'dueling-dqn' + mock_parameters.return_value.hidden_layers = '[2, [2], [2]]' + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + mock_model.assert_called_with((1,), 2, '[2, [2], [2]]', None) + + @patch('cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_customized_q(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.q_representation = \ + 'cntk.contrib.deeprl.agent.shared.customized_models.conv_dqn' + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + self.assertEqual(sut._num_actions, 2) + self.assertIsNone(sut._num_states) + self.assertEqual(sut._shape_of_inputs, (1,)) + self.assertFalse(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + mock_model.assert_called_with((1,), 2, None) + + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_unsupported_q(self, mock_parameters): + instance = mock_parameters.return_value + instance.q_representation = 'undefined' + instance.preprocessing = '' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + self.assertRaises( + ValueError, QLearning, '', observation_space, action_space) + + @patch('cntk.contrib.deeprl.agent.qlearning.Models.feedforward_network') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_init_dqn_huber_loss(self, mock_parameters, mock_model): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.use_error_clipping = True + mock_model.return_value = self._setup_test_model() + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + mock_model.assert_called_with((1,), 2, '[2]', huber_loss) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q(self, + mock_parameters, + mock_replay_memory): + """Test if _update_q_periodically() can finish successfully.""" + self._setup_parameters(mock_parameters.return_value) + self._setup_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + sut._trainer.train_minibatch = MagicMock() + sut._choose_action = MagicMock(side_effect=[ + (1, 'GREEDY'), + (0, 'GREEDY'), + (1, 
'RANDOM'), + ]) + + action, debug_info = sut.start(np.array([0.1], np.float32)) + self.assertEqual(action, 1) + self.assertEqual(debug_info['action_behavior'], 'GREEDY') + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 0) + self.assertEqual(sut._epsilon, 0.1) + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.1) + self.assertEqual(sut._last_state, np.array([0.1], np.float32)) + self.assertEqual(sut._last_action, 1) + + action, debug_info = sut.step(1, np.array([0.2], np.float32)) + self.assertEqual(action, 0) + self.assertEqual(debug_info['action_behavior'], 'GREEDY') + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 1) + self.assertEqual(sut._epsilon, 0.09) + # learning rate remains 0.1 as Q is not updated during this time step. + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.1) + self.assertEqual(sut._last_state, np.array([0.2], np.float32)) + self.assertEqual(sut._last_action, 0) + + action, debug_info = sut.step(2, np.array([0.3], np.float32)) + self.assertEqual(action, 1) + self.assertEqual(debug_info['action_behavior'], 'RANDOM') + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 2) + self.assertEqual(sut._epsilon, 0.08) + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.08) + self.assertEqual(sut._last_state, np.array([0.3], np.float32)) + self.assertEqual(sut._last_action, 1) + + sut.end(3, np.array([0.4], np.float32)) + self.assertEqual(sut.episode_count, 1) + self.assertEqual(sut.step_count, 3) + self.assertEqual(sut._epsilon, 0.08) + # learning rate remains 0.08 as Q is not updated during this time step. + self.assertEqual(sut._trainer.parameter_learners[0].learning_rate(), 0.08) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q_dqn(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + self._setup_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + sut._q.eval = \ + MagicMock(return_value=np.array([[[0.2, 0.1]]], np.float32)) + sut._target_q.eval = \ + MagicMock(return_value=np.array([[[0.3, 0.4]]], np.float32)) + sut._trainer = MagicMock() + + sut._update_q_periodically() + + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._input_variables], + [np.array([0.1], np.float32)]) + # 10 (reward) + 0.9 (gamma) x 0.4 (max q_target) -> update action 0 + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._output_variables], + [np.array([10.36, 0.1], np.float32)]) + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q_dqn_prioritized_replay(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.use_prioritized_replay = True + self._setup_prioritized_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + def new_q_value(self): + return np.array([[[0.2, 0.1]]], np.float32) + sut._q.eval = MagicMock(side_effect=new_q_value) + sut._target_q.eval = MagicMock( + return_value=np.array([[[0.3, 0.4]]], 
np.float32)) + sut._trainer = MagicMock() + + sut._update_q_periodically() + + self.assertEqual(sut._trainer.train_minibatch.call_count, 1) + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._input_variables], + [ + np.array([0.1], np.float32), + np.array([0.3], np.float32), + np.array([0.1], np.float32) + ]) + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._output_variables], + [ + # 10 (reward) + 0.9 (gamma) x 0.4 (max q_target) + np.array([10.36, 0.1], np.float32), + # 11 (reward) + 0.9 (gamma) x 0.4 (max q_target) + np.array([0.2, 11.36], np.float32), + np.array([10.36, 0.1], np.float32) + ]) + np.testing.assert_almost_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._weight_variables], + [ + [0.16666667], + [0.66666667], + [0.16666667] + ]) + self.assertAlmostEqual( + sut._replay_memory.update_priority.call_args[0][0][3], + 105.2676) # (10.16 + 0.1)^2 + self.assertAlmostEqual( + sut._replay_memory.update_priority.call_args[0][0][4], + 129.0496, + places=6) # (11.26 + 0.1) ^ 2 + + @patch('cntk.contrib.deeprl.agent.qlearning.ReplayMemory') + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_update_q_double_dqn(self, + mock_parameters, + mock_replay_memory): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.double_q_learning = True + self._setup_replay_memory(mock_replay_memory.return_value) + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + sut._q.eval = \ + MagicMock(return_value=np.array([[[0.2, 0.1]]], np.float32)) + sut._target_q.eval = \ + MagicMock(return_value=np.array([[[0.3, 0.4]]], np.float32)) + sut._trainer = MagicMock() + + sut._update_q_periodically() + + # 10 (reward) + 0.9 (gamma) x 0.3 -> update action 0 + np.testing.assert_array_equal( + sut._trainer.train_minibatch.call_args[0][0][sut._output_variables], + [np.array([10.27, 0.1], np.float32)]) + + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_populate_replay_memory(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + mock_parameters.return_value.preprocessing = \ + 'cntk.contrib.deeprl.agent.shared.preprocessing.SlidingWindow' + mock_parameters.return_value.preprocessing_args = '(2, )' + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + + sut._compute_priority = Mock(side_effect=[1, 2, 3]) + sut._choose_action = Mock( + side_effect=[(0, ''), (0, ''), (1, ''), (1, '')]) + sut._replay_memory = MagicMock() + sut._update_q_periodically = MagicMock() + + sut.start(np.array([0.1], np.float32)) + sut.step(0.1, np.array([0.2], np.float32)) + sut.step(0.2, np.array([0.3], np.float32)) + sut.end(0.3, np.array([0.4], np.float32)) + + self.assertEqual(sut._replay_memory.store.call_count, 3) + + call_args = sut._replay_memory.store.call_args_list[0] + np.testing.assert_array_equal( + call_args[0][0], + np.array([[0], [0.1]], np.float32)) + self.assertEqual(call_args[0][1], 0) + self.assertEqual(call_args[0][2], 0.1) + np.testing.assert_array_equal( + call_args[0][3], + np.array([[0.1], [0.2]], np.float32)) + self.assertEqual(call_args[0][4], 1) + + call_args = sut._replay_memory.store.call_args_list[2] + np.testing.assert_array_equal( + call_args[0][0], + np.array([[0.2], [0.3]], np.float32)) + self.assertEqual(call_args[0][1], 1) + 
self.assertEqual(call_args[0][2], 0.3) + self.assertIsNone(call_args[0][3]) + self.assertEqual(call_args[0][4], 3) + + @patch('cntk.contrib.deeprl.agent.qlearning.QLearningParameters') + def test_replay_start_size(self, mock_parameters): + self._setup_parameters(mock_parameters.return_value) + # Set exploration rate to 0 + mock_parameters.return_value.initial_epsilon = 0 + mock_parameters.return_value.epsilon_decay_step_count = 100 + mock_parameters.return_value.epsilon_minimum = 0 + mock_parameters.return_value.replay_start_size = 3 + + action_space = spaces.Discrete(2) + observation_space = spaces.Box(0, 1, (1,)) + sut = QLearning('', observation_space, action_space) + sut._trainer = MagicMock() + sut._replay_memory = MagicMock() + + _, debug = sut.start(np.array([0.1], np.float32)) + self.assertEqual(sut.step_count, 0) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'RANDOM') + + _, debug = sut.step(0.1, np.array([0.2], np.float32)) + self.assertEqual(sut.step_count, 1) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'RANDOM') + + sut.end(0.2, np.array([0.3], np.float32)) + self.assertEqual(sut.step_count, 2) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + + _, debug = sut.start(np.array([0.4], np.float32)) + self.assertEqual(sut.step_count, 2) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'RANDOM') + + a, debug = sut.step(0.3, np.array([0.5], np.float32)) + self.assertEqual(sut.step_count, 3) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'GREEDY') + + a, debug = sut.start(np.array([0.6], np.float32)) + self.assertEqual(sut.step_count, 3) + self.assertEqual(sut._trainer.train_minibatch.call_count, 0) + self.assertEqual(debug['action_behavior'], 'GREEDY') + + a, debug = sut.step(0.4, np.array([0.7], np.float32)) + self.assertEqual(sut.step_count, 4) + self.assertEqual(sut._trainer.train_minibatch.call_count, 1) + self.assertEqual(debug['action_behavior'], 'GREEDY') + + def _setup_parameters(self, parameters): + parameters.q_representation = 'dqn' + parameters.hidden_layers = '[2]' + parameters.initial_epsilon = 0.1 + parameters.epsilon_decay_step_count = 9 + parameters.epsilon_minimum = 0.01 + parameters.initial_eta = 0.1 + parameters.eta_decay_step_count = 9 + parameters.eta_minimum = 0.01 + parameters.momentum = 0.95 + parameters.gradient_clipping_threshold = 10 + parameters.q_update_frequency = 2 + parameters.gamma = 0.9 + parameters.double_q_learning = False + parameters.replay_start_size = 0 + parameters.replay_memory_capacity = 100 + parameters.use_prioritized_replay = False + parameters.priority_alpha = 2 + parameters.priority_beta = 2 + parameters.priority_epsilon = 0.1 + parameters.preprocessing = '' + parameters.use_error_clipping = False + parameters.replays_per_update = 1 + + def _setup_replay_memory(self, replay_memory): + replay_memory.sample_minibatch.side_effect = \ + [[(0, _Transition( + np.array([0.1], np.float32), + 0, + 10, + np.array([0.2], np.float32), + 0.01))], + [(1, _Transition( + np.array([0.3], np.float32), + 1, + -10, + np.array([0.4], np.float32), + 0.02))]] + + def _setup_prioritized_replay_memory(self, replay_memory): + # Duplicated values can be returned. 
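+ # Stratified sampling may map two strata onto the same high-priority leaf.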
+ replay_memory.sample_minibatch.return_value = \ + [(3, _Transition( + np.array([0.1], np.float32), + 0, + 10, + np.array([0.2], np.float32), + 2)), + (4, _Transition( + np.array([0.3], np.float32), + 1, + 11, + np.array([0.4], np.float32), + 1)), + (3, _Transition( + np.array([0.1], np.float32), + 0, + 10, + np.array([0.2], np.float32), + 2))] + + def _setup_test_model(self): + inputs = input_variable(shape=(1,), dtype=np.float32) + outputs = input_variable(shape=(1,), dtype=np.float32) + + q = Dense(1, activation=None)(inputs) + loss = squared_error(q, outputs) + + return { + 'inputs': inputs, + 'outputs': outputs, + 'f': q, + 'loss': loss + } diff --git a/bindings/python/cntk/contrib/deeprl/tests/replay_memory_test.py b/bindings/python/cntk/contrib/deeprl/tests/replay_memory_test.py new file mode 100644 index 000000000000..f6a5912e5089 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/replay_memory_test.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest + +from cntk.contrib.deeprl.agent.shared.replay_memory import ReplayMemory + + +class ReplayMemoryTest(unittest.TestCase): + """Unit tests for ReplayMemory.""" + + def test_uniform_sampling(self): + sut = ReplayMemory(3) + self.assertEqual(sut.sample_minibatch(1), []) + + sut.store(1, 'ignore', 'ignore', 'ignore', 0) + self.assertEqual(sut.size(), 1) + self.assertEqual([s[0] for s in sut.sample_minibatch(1)], [0]) + self.assertEqual([s[0] for s in sut.sample_minibatch(2)], [0]) + + sut.store(2, 'ignore', 'ignore', 'ignore', 0) + sut.store(3, 'ignore', 'ignore', 'ignore', 0) + self.assertEqual(sut.size(), 3) + samples = sut.sample_minibatch(1) + self.assertEqual(len(samples), 1) + self.assertTrue(set(s[0] for s in samples).issubset([0, 1, 2])) + self.assertTrue(set(s[1].state for s in samples).issubset([1, 2, 3])) + + sut.store(4, 'ignore', 'ignore', 'ignore', 0) + self.assertEqual(sut.size(), 3) + samples = sut.sample_minibatch(1) + self.assertEqual(len(samples), 1) + self.assertTrue(set(s[0] for s in samples).issubset([0, 1, 2])) + self.assertTrue(set(s[1].state for s in samples).issubset([2, 3, 4])) + + def test_prioritized_sampling(self): + sut = ReplayMemory(3, True) + self.assertEqual(sut.sample_minibatch(1), []) + + sut.store(1, 'ignore', 'ignore', 'ignore', 1) + self.assertEqual(sut.size(), 1) + self.assertEqual([s[0] for s in sut.sample_minibatch(1)], [2]) + self.assertEqual([s[0] for s in sut.sample_minibatch(2)], [2, 2]) + + sut.store(2, 'ignore', 'ignore', 'ignore', 3) + sut.store(3, 'ignore', 'ignore', 'ignore', 2) + self.assertEqual(sut.size(), 3) + self.assertEqual(len(sut._memory), 5) + self.assertEqual(sut._memory[:2], [6, 5]) + + samples = sut.sample_minibatch(2) + self.assertEqual(len(samples), 2) + self.assertEqual(samples[0][0], 3) + self.assertEqual(samples[0][1].state, 2) + + sut.store(4, 'ignore', 'ignore', 'ignore', 5) + self.assertEqual(sut.size(), 3) + self.assertEqual(sut._memory[:2], [10, 5]) + + samples = sut.sample_minibatch(2) + self.assertEqual(len(samples), 2) + self.assertIn(samples[0][0], [3, 4]) + self.assertIn(samples[0][1].state, [2, 3]) + self.assertEqual(samples[1][0], 2) + self.assertEqual(samples[1][1].state, 4) + + sut.update_priority({3: 4, 4: 0.5}) + self.assertEqual(sut._memory[:2], [9.5, 4.5]) diff --git 
a/bindings/python/cntk/contrib/deeprl/tests/spaces.py b/bindings/python/cntk/contrib/deeprl/tests/spaces.py new file mode 100644 index 000000000000..6da5d2acbf22 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/spaces.py @@ -0,0 +1,51 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import numpy as np + + +class Box: + """Fake gym.spaces.box.Box to remove dependency on OpenAI gym.""" + + def __init__(self, low, high, shape=None): + if shape is None: + assert low.shape == high.shape + self.low = low + self.high = high + else: + assert np.isscalar(low) and np.isscalar(high) + self.low = low + np.zeros(shape) + self.high = high + np.zeros(shape) + + self.__class__.__module__ = 'gym.spaces.box' + + @property + def shape(self): + return self.low.shape + + +class Discrete: + """Fake gym.spaces.discrete.Discrete to remove dependency on OpenAI gym.""" + + def __init__(self, n): + self.n = n + self.__class__.__module__ = 'gym.spaces.discrete' + + +class Tuple: + """Fake gym.spaces.tuple_space.Tuple to remove dependency on OpenAI gym.""" + + def __init__(self, spaces): + self.spaces = spaces + self.__class__.__module__ = 'gym.spaces.tuple_space' + + +class MultiBinary: + """Fake gym.spaces.multi_binary.MultiBinary to remove dependency on OpenAI gym.""" + + def __init__(self, n): + self.n = n + self.__class__.__module__ = 'gym.spaces.multi_binary' diff --git a/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py new file mode 100644 index 000000000000..9bc5c0240487 --- /dev/null +++ b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py @@ -0,0 +1,133 @@ +# Copyright (c) Microsoft. All rights reserved. + +# Licensed under the MIT license. See LICENSE.md file in the project root +# for full license information. +# ============================================================================== + +import unittest +from unittest.mock import patch + +import cntk.contrib.deeprl.tests.spaces as spaces +import numpy as np +from cntk.contrib.deeprl.agent.tabular_qlearning import TabularQLearning + + +class FakeTabularQLearning(TabularQLearning): + """Override TabularQLearning for unittest.""" + + def _choose_action(self, state): + """Fake epsilon greedy policy.""" + return state % 2, 'GREEDY' + + +class TabularQLearningTest(unittest.TestCase): + """Unit tests for TabularQLearning.""" + + def test_init(self): + # Discrete observation space. + action_space = spaces.Discrete(2) + observation_space = spaces.Discrete(3) + sut = TabularQLearning('', observation_space, action_space) + self.assertEqual(sut._num_actions, 2) + self.assertEqual(sut._num_states, 3) + self.assertEqual(sut._shape_of_inputs, (3, )) + self.assertTrue(sut._discrete_observation_space) + self.assertIsNone(sut._space_discretizer) + self.assertIsNone(sut._preprocessor) + + # Discretize observation space to default resolution. 
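+ # With the default DiscretizationResolution of 10 per dimension, a 2-D Box maps to 100 states.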
diff --git a/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py
new file mode 100644
index 000000000000..9bc5c0240487
--- /dev/null
+++ b/bindings/python/cntk/contrib/deeprl/tests/tabular_qlearning_test.py
@@ -0,0 +1,133 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+
+import unittest
+from unittest.mock import patch
+
+import cntk.contrib.deeprl.tests.spaces as spaces
+import numpy as np
+from cntk.contrib.deeprl.agent.tabular_qlearning import TabularQLearning
+
+
+class FakeTabularQLearning(TabularQLearning):
+    """Override TabularQLearning for unittest."""
+
+    def _choose_action(self, state):
+        """Fake epsilon greedy policy."""
+        return state % 2, 'GREEDY'
+
+
+class TabularQLearningTest(unittest.TestCase):
+    """Unit tests for TabularQLearning."""
+
+    def test_init(self):
+        # Discrete observation space.
+        action_space = spaces.Discrete(2)
+        observation_space = spaces.Discrete(3)
+        sut = TabularQLearning('', observation_space, action_space)
+        self.assertEqual(sut._num_actions, 2)
+        self.assertEqual(sut._num_states, 3)
+        self.assertEqual(sut._shape_of_inputs, (3, ))
+        self.assertTrue(sut._discrete_observation_space)
+        self.assertIsNone(sut._space_discretizer)
+        self.assertIsNone(sut._preprocessor)
+
+        # Discretize observation space to default resolution.
+        observation_space = spaces.Box(0, 1, (2,))
+        sut = TabularQLearning('', observation_space, action_space)
+        self.assertEqual(sut._num_states, 100)
+        self.assertEqual(sut._shape_of_inputs, (100, ))
+        self.assertTrue(sut._discrete_observation_space)
+        self.assertIsNotNone(sut._space_discretizer)
+        # Verify encoding of state
+        self.assertEqual(sut._discretize_state_if_necessary([0, 0]), 0)
+        self.assertEqual(sut._discretize_state_if_necessary([0.05, 0]), 0)
+        self.assertEqual(sut._discretize_state_if_necessary([0.95, 0]), 90)
+        self.assertEqual(sut._discretize_state_if_necessary([0, 0.05]), 0)
+        self.assertEqual(sut._discretize_state_if_necessary([0, 0.95]), 9)
+        self.assertEqual(sut._discretize_state_if_necessary([0.1, 0.2]), 12)
+        self.assertEqual(sut._discretize_state_if_necessary([1, 1]), 99)
+
+        # Unsupported observation space for tabular representation
+        observation_space = spaces.MultiBinary(10)
+        self.assertRaises(
+            ValueError, TabularQLearning, '', observation_space, action_space)
+
+    @patch('cntk.contrib.deeprl.agent.tabular_qlearning.QLearningParameters')
+    def test_init_unsupported_q(self, mock_qlearn_parameters):
+        mock_qlearn_parameters.return_value.q_representation = 'undefined'
+
+        action_space = spaces.Discrete(2)
+        observation_space = spaces.Discrete(3)
+        self.assertRaises(
+            ValueError, TabularQLearning, '', observation_space, action_space)
+
+    @patch('cntk.contrib.deeprl.agent.tabular_qlearning.QLearningParameters')
+    def test_update(self, mock_qlearn_parameters):
+        self._setup_qlearn_parameters(mock_qlearn_parameters.return_value)
+        action_space = spaces.Discrete(2)
+        observation_space = spaces.Discrete(3)
+        sut = FakeTabularQLearning('', observation_space, action_space)
+
+        sut.start(0)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 0)
+        self.assertEqual(sut._epsilon, 0.1)
+        # _eta has not been defined so far.
+        self.assertEqual(sut._last_state, 0)
+        self.assertEqual(sut._last_action, 0)
+
+        sut.step(1, 1)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 1)
+        self.assertEqual(sut._epsilon, 0.09)
+        self.assertEqual(sut._eta, 0.1)
+        self.assertEqual(sut._last_state, 1)
+        self.assertEqual(sut._last_action, 1)
+        np.testing.assert_array_equal(
+            sut._q, [[0.1, 0], [0, 0], [0, 0]])
+
+        sut.step(1, 1)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 2)
+        self.assertEqual(sut._epsilon, 0.08)
+        self.assertEqual(sut._eta, 0.09)
+        self.assertEqual(sut._last_state, 1)
+        self.assertEqual(sut._last_action, 1)
+        np.testing.assert_array_equal(
+            sut._q, [[0.1, 0], [0, 0.09], [0, 0]])
+
+        sut.step(1, 1)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 3)
+        self.assertEqual(sut._epsilon, 0.07)
+        self.assertEqual(sut._eta, 0.08)
+        self.assertEqual(sut._last_state, 1)
+        self.assertEqual(sut._last_action, 1)
+        # 0.16928 = 0.09 + (1(reward) + 0.9(gamma)*max([0, 0.09]) - 0.09) * 0.08(eta)
+        np.testing.assert_almost_equal(
+            sut._q, [[0.1, 0], [0, 0.16928], [0, 0]])
+
+        sut.end(1, 2)
+        self.assertEqual(sut.episode_count, 1)
+        self.assertEqual(sut.step_count, 4)
+        # _epsilon remains the same as no action is chosen in end().
+        self.assertEqual(sut._epsilon, 0.07)
+        self.assertEqual(sut._eta, 0.07)
+        # 0.2274304 = 0.16928 + (1(reward) - 0.16928) * 0.07(eta)
+        np.testing.assert_almost_equal(
+            sut._q, [[0.1, 0], [0, 0.2274304], [0, 0]])
+
+    def _setup_qlearn_parameters(self, qlearn_parameters):
+        qlearn_parameters.q_representation = 'tabular'
+        qlearn_parameters.initial_q = 0
+        qlearn_parameters.initial_epsilon = 0.1
+        qlearn_parameters.epsilon_decay_step_count = 9
+        qlearn_parameters.epsilon_minimum = 0.01
+        qlearn_parameters.initial_eta = 0.1
+        qlearn_parameters.eta_decay_step_count = 9
+        qlearn_parameters.eta_minimum = 0.01
+        qlearn_parameters.gamma = 0.9
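
The numeric comments inside test_update spell out the update rule being exercised: Q(s, a) <- Q(s, a) + eta * (reward + gamma * max_a' Q(s', a') - Q(s, a)), with gamma = 0.9 taken from _setup_qlearn_parameters and no bootstrap term at the end of an episode. As a sanity check, the standalone snippet below (q_update is a made-up helper, not part of the patch) reproduces every value asserted on sut._q:

    def q_update(q_sa, reward, eta, gamma=0.9, q_next_max=None):
        # q_next_max is None for the terminal update in end(): no bootstrap.
        target = reward if q_next_max is None else reward + gamma * q_next_max
        return q_sa + eta * (target - q_sa)

    print(round(q_update(0.0, 1, eta=0.1, q_next_max=0.0), 7))      # 0.1
    print(round(q_update(0.0, 1, eta=0.09, q_next_max=0.0), 7))     # 0.09
    print(round(q_update(0.09, 1, eta=0.08, q_next_max=0.09), 7))   # 0.16928
    print(round(q_update(0.16928, 1, eta=0.07), 7))                 # 0.2274304
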
diff --git a/bindings/python/cntk/ops/__init__.py b/bindings/python/cntk/ops/__init__.py
index 6d712eddb523..545ffafa5e0c 100755
--- a/bindings/python/cntk/ops/__init__.py
+++ b/bindings/python/cntk/ops/__init__.py
@@ -1471,6 +1471,50 @@ def cos(x, name=''):
     x = sanitize_input(x)
     return cos(x, name)
 
+@typemap
+def sinh(x, name=''):
+    '''
+    Computes the element-wise sinh of ``x``:
+
+    The output tensor has the same shape as ``x``.
+
+    Example:
+        >>> np.round(C.sinh([[1,0.5],[-0.25,-0.75]]).eval(),5)
+        array([[ 1.1752 ,  0.5211 ],
+               [-0.25261, -0.82232]], dtype=float32)
+
+    Args:
+        x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor
+        name (str, optional): the name of the Function instance in the network
+    Returns:
+        :class:`~cntk.ops.functions.Function`
+    '''
+    from cntk.cntk_py import sinh
+    x = sanitize_input(x)
+    return sinh(x, name)
+
+@typemap
+def cosh(x, name=''):
+    '''
+    Computes the element-wise cosh of ``x``:
+
+    The output tensor has the same shape as ``x``.
+
+    Example:
+        >>> np.round(C.cosh([[1,0.5],[-0.25,-0.75]]).eval(),5)
+        array([[ 1.54308,  1.12763],
+               [ 1.03141,  1.29468]], dtype=float32)
+
+    Args:
+        x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor
+        name (str, optional): the name of the Function instance in the network
+    Returns:
+        :class:`~cntk.ops.functions.Function`
+    '''
+    from cntk.cntk_py import cosh
+    x = sanitize_input(x)
+    return cosh(x, name)
+
 @typemap
 def softmax(x, axis=None, name=''):
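
The doctests above already pin down the expected sinh/cosh values. On a build that includes this patch, a quick consistency check against NumPy and the identity cosh(x)^2 - sinh(x)^2 = 1 could look like the following (illustrative only, not part of the patch):

    import numpy as np
    import cntk as C

    x = np.array([[1, 0.5], [-0.25, -0.75]], dtype=np.float32)
    s = C.sinh(x).eval()
    c = C.cosh(x).eval()

    # The new ops should agree with NumPy and satisfy cosh^2 - sinh^2 == 1.
    np.testing.assert_allclose(s, np.sinh(x), rtol=1e-5)
    np.testing.assert_allclose(c, np.cosh(x), rtol=1e-5)
    np.testing.assert_allclose(c ** 2 - s ** 2, np.ones_like(x), rtol=1e-5)
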