diff --git a/README.md b/README.md index 8742d27..8965f42 100644 --- a/README.md +++ b/README.md @@ -58,3 +58,5 @@ If you are not a member of the MeTaL team, feel free to submit pull requests and ## Code Structure ### Algorithms: This contains any code pertaining to learning/neural network algorithms and architectures. +### Tests: +Contains code for testing neural network architectures diff --git a/algorithms/architectures.py b/algorithms/architectures.py index a6cccd4..f0181e1 100644 --- a/algorithms/architectures.py +++ b/algorithms/architectures.py @@ -9,162 +9,166 @@ import tensorflow as tf -def FeedForward(_input, hparams, name="ffn"): - """ - Builds a Feed Forward NN with linear output - - Args: - _input: Tensor of shape [None, input_size] - hparams: Dictionary of hyperparameters - 'output_size': Dimensionality of output - 'hidden_sizes': List of hidden layer sizes - 'activations': List of activation functions for each layer - Returns: - Output tensor of shape [None, output_size] - @Authors: Arsh Zahed - """ - - # We iteratively nest the layers - net = _input - hidden_sizes = hparams['hidden_sizes'] - activations = hparams['activations'] - with tf.variable_scope(name): - for i in range(len(activations)): - net = tf.layers.dense(net, hidden_sizes[i], activations[i]) - # Call our prediction/policy y_hat. - # Linear activation allows for logits - y_hat = tf.layers.dense(net, hparams['output_size']) - - return y_hat - - -def MakeRNNCell(rnn_layer_sizes, - dropout_keep_prob=1.0, - attn_length=0, - base_cell=tf.contrib.rnn.BasicLSTMCell, - residual_connections=False, - activation=tf.nn.tanh): - """ - Makes an RNN cell from the given hyperparameters. (From Magenta) - - Args: - rnn_layer_sizes: A list of integer sizes (in units) for each layer of - the RNN. - dropout_keep_prob: The float probability to keep the output of any - given sub-cell. - attn_length: The size of the attention vector. - base_cell: The base tf.contrib.rnn.RNNCell to use for sub-cells. - - Returns: - A tf.contrib.rnn.MultiRNNCell based on the given hyperparameters. - @Authors: Arsh Zahed - """ - cells = [] - for i in range(len(rnn_layer_sizes)): - cell = base_cell(rnn_layer_sizes[i], activation=activation) - if attn_length and not cells: - # Add attention wrapper to first layer. - cell = tf.contrib.rnn.AttentionCellWrapper( - cell, attn_length, state_is_tuple=True) - if residual_connections: - cell = tf.contrib.rnn.ResidualWrapper(cell) - if i == 0 or rnn_layer_sizes[i] != rnn_layer_sizes[i - 1]: - cell = tf.contrib.rnn.InputProjectionWrapper(cell, rnn_layer_sizes[i]) - cell = tf.contrib.rnn.DropoutWrapper( - cell, output_keep_prob=dropout_keep_prob) - cells.append(cell) - - cell = tf.contrib.rnn.MultiRNNCell(cells) - - return cell - - -def DynamicRNN(_input, hparams, initial_state=None, name="lstm"): - """ - Builds andand executes Dynamic RNN with specified activation - - Args: - _input: Tensor of shape [None, total_time, input_size] - hparams: Dictionary of hyperparameters - 'rnn_layer_sizes': List of RNN layer sizes - 'dropout_keep_prob': Probability of not dropping - 'attn_length': Integer length of attention - 'base_cell': RNN Cell class from tf.contrib.rnn.* - 'residual_connections': Boolean, True to have residuals - 'activation': Output activation of RNN - Returns: - Outputs and states Tensors. 
Output Tensor of shape - [None, total_time, rnn_layer_sizes[-1]] - State Tensor (tuples) match shapes specified in hyperparameters - @Authors: Arsh Zahed - """ - - # Set defaults if they dont exist in hparams - if 'dropout_keep_prob' not in hparams: - hparams['dropout_keep_prob'] = 1.0 - if 'attn_length' not in hparams: - hparams['attn_length'] = 0 - if 'base_cell' not in hparams: - hparams['base_cell'] = tf.contrib.rnn.BasicLSTMCell - if 'residual_connections' not in hparams: - hparams['residual_connections'] = False - if 'activation' not in hparams: - hparams['activation'] = tf.tanh - - # Build RNN Cell - with tf.variable_scope(name): - rnn_cell = MakeRNNCell(hparams['rnn_layer_sizes'], - hparams['dropout_keep_prob'], - hparams['attn_length'], - hparams['base_cell'], - hparams['residual_connections'], - hparams['activation']) - - outputs, states = tf.nn.dynamic_rnn(rnn_cell, _input, initial_state=initial_state, - dtype=_input.dtype) - - return outputs, states - - -def CNN(_input, hparams, name="cnn"): - """ - Builds a Convolutional Neural Network with a flattened output - - Args: - _input: Tensor of shape [None, image_height, image_width, channels] - hparams: Dictionary of hyperparameters - 'feature_maps': List of feature maps for each layer - 'kernel_sizes': List of kernel sizes for each layer - 'stride_lengths': List of strides for each layer - 'padding_types': List of padding for each layer - 'activations': List of activation functions for each layer - Returns: - Flattened output tensor of shape [None, output_size] - @Authors: Yi Liu - """ - - net = _input - feature_maps = hparams['feature_maps'] - kernel_sizes = hparams['kernel_sizes'] - stride_lengths = hparams['stride_lengths'] - padding_types = hparams['padding_types'] - activations = hparams['activations'] - - with tf.variable_scope(name): - for i in range(len(activations)): - net = tf.layers.conv2d( - inputs=net, - filters=feature_maps[i], - kernel_size=kernel_sizes[i], - strides=stride_lengths[i], - padding=padding_types[i], - activation=activations[i] - ) - # Flatten network - flat = tf.contrib.layers.flatten(net) - return flat - - -def RCNN(self, _input, hparams, name='rcnn'): - # TODO - raise NotImplementedError('RCNN not implemented') + +def feed_forward(_input, hparams, name="ffn"): + """ + Builds a Feed Forward NN with linear output + + Args: + _input: Tensor of shape [None, input_size] + hparams: Dictionary of hyperparameters + 'output_size': Dimensionality of output + 'hidden_sizes': List of hidden layer sizes + 'activations': List of activation functions for each layer + name: Variable scope name + Returns: + Output tensor of shape [None, output_size] + @Authors: Arsh Zahed + """ + + # Iteratively nest the layers + net = _input + hidden_sizes = hparams['hidden_sizes'] + activations = hparams['activations'] + with tf.variable_scope(name): + for i in range(len(activations)): + net = tf.layers.dense(net, hidden_sizes[i], activations[i]) + # Call our prediction/policy y_hat. + # Linear activation allows for logits + y_hat = tf.layers.dense(net, hparams['output_size']) + + return y_hat + + +def make_rnn_cell(rnn_layer_sizes, + dropout_keep_prob=1.0, + attn_length=0, + base_cell=tf.contrib.rnn.BasicLSTMCell, + residual_connections=False, + activation=tf.nn.tanh): + """ + Makes an RNN cell from the given hyperparameters. (From Magenta) + + Args: + rnn_layer_sizes: A list of integer sizes (in units) for each layer of + the RNN. + dropout_keep_prob: The float probability to keep the output of any + given sub-cell. 
+ attn_length: The size of the attention vector. + base_cell: The base tf.contrib.rnn.RNNCell to use for sub-cells. + Returns: + A tf.contrib.rnn.MultiRNNCell based on the given hyperparameters. + @Authors: Arsh Zahed + """ + cells = [] + for i in range(len(rnn_layer_sizes)): + cell = base_cell(rnn_layer_sizes[i], activation=activation) + if attn_length and not cells: + # Add attention wrapper to first layer. + cell = tf.contrib.rnn.AttentionCellWrapper( + cell, attn_length, state_is_tuple=True) + if residual_connections: + cell = tf.contrib.rnn.ResidualWrapper(cell) + if i == 0 or rnn_layer_sizes[i] != rnn_layer_sizes[i - 1]: + cell = tf.contrib.rnn.InputProjectionWrapper(cell, rnn_layer_sizes[i]) + cell = tf.contrib.rnn.DropoutWrapper( + cell, output_keep_prob=dropout_keep_prob) + cells.append(cell) + + cell = tf.contrib.rnn.MultiRNNCell(cells) + + return cell + + +def dynamic_rnn(_input, hparams, initial_state=None, name="lstm"): + """ + Builds and executes Dynamic RNN with specified activation + + Args: + _input: Tensor of shape [None, total_time, input_size] + hparams: Dictionary of hyperparameters + 'rnn_layer_sizes': List of RNN layer sizes + 'dropout_keep_prob': Probability of not dropping + 'attn_length': Integer length of attention + 'base_cell': RNN Cell class from tf.contrib.rnn.* + 'residual_connections': Boolean, True to have residuals + 'activation': Output activation of RNN + name: Variable scope name + Returns: + Outputs and states Tensors. Output Tensor of shape + [None, total_time, rnn_layer_sizes[-1]] + State Tensor (tuples) match shapes specified in hyperparameters + @Authors: Arsh Zahed + """ + + # Set defaults if they don't exist in hparams + if 'dropout_keep_prob' not in hparams: + hparams['dropout_keep_prob'] = 1.0 + if 'attn_length' not in hparams: + hparams['attn_length'] = 0 + if 'base_cell' not in hparams: + hparams['base_cell'] = tf.contrib.rnn.BasicLSTMCell + if 'residual_connections' not in hparams: + hparams['residual_connections'] = False + if 'activation' not in hparams: + hparams['activation'] = tf.tanh + + # Build RNN Cell + with tf.variable_scope(name): + rnn_cell = make_rnn_cell(hparams['rnn_layer_sizes'], + hparams['dropout_keep_prob'], + hparams['attn_length'], + hparams['base_cell'], + hparams['residual_connections'], + hparams['activation']) + + outputs, states = tf.nn.dynamic_rnn(rnn_cell, _input, initial_state=initial_state, + dtype=_input.dtype) + + return outputs, states + + +def cnn(_input, hparams, name="cnn"): + """ + Builds a Convolutional Neural Network with a flattened output + + Args: + _input: Tensor of shape [None, image_height, image_width, channels] + hparams: Dictionary of hyperparameters + 'feature_maps': List of feature maps for each layer + 'kernel_sizes': List of kernel sizes for each layer + 'stride_lengths': List of strides for each layer + 'padding_types': List of padding for each layer + 'activations': List of activation functions for each layer + name: Variable scope name + Returns: + Flattened output tensor of shape [None, output_size] + @Authors: Yi Liu + """ + + net = _input + feature_maps = hparams['feature_maps'] + kernel_sizes = hparams['kernel_sizes'] + stride_lengths = hparams['stride_lengths'] + padding_types = hparams['padding_types'] + activations = hparams['activations'] + + with tf.variable_scope(name): + for i in range(len(activations)): + net = tf.layers.conv2d( + inputs=net, + filters=feature_maps[i], + kernel_size=kernel_sizes[i], + strides=stride_lengths[i], + padding=padding_types[i], +
activation=activations[i] + ) + # Flatten network + flat = tf.contrib.layers.flatten(net) + return flat + + +def rcnn(_input, hparams, name='rcnn'): + # TODO + raise NotImplementedError('RCNN not implemented') + diff --git a/algorithms/policygrad.py b/algorithms/policygrad.py index 3a10513..3329435 100644 --- a/algorithms/policygrad.py +++ b/algorithms/policygrad.py @@ -1,148 +1,59 @@ +""" +Classes for policy gradient neural networks +@Authors: Yi Liu +""" import tensorflow as tf -from algorithms import architectures +from algorithms.architectures import feed_forward -def discount_and_normalize_rewards(episode_rewards): +class PGFFNetwork: """ - Discounts and normalizes rewards from an episode - - TODO: Review and finish - """ - - discounted_episode_rewards = np.zeros_like(episode_rewards) - cumulative = 0.0 - for i in reversed(range(len(episode_rewards))): - cumulative = cumulative * gamma + episode_rewards[i] - discounted_episode_rewards[i] = cumulative - - mean = np.mean(discounted_episode_rewards) - std = np.std(discounted_episode_rewards) - discounted_episode_rewards = (discounted_episode_rewards - mean) / (std) - - return discounted_episode_rewards - - -def SampleGaussian(means, log_simga_sqs): + Creates a policy gradient feed forward neural network + @Authors: Yi Liu """ - Differentiably samples from Guassian using reparameterization trick. + def __init__(self, sess, state_size, action_size, ff_hparams, lr, name='PGFFNetwork'): + self.lr = lr + self.sess = sess - Args: - means: Tensor of mean values - log_sigma_sqs: Tensor of the logarithms of the variances - Returs: - Tensor of sampled gussian - @Authors: Arsh Zahed - """ - unit = tf.random_norm(means.shape, 0, 1) - with_var = tf.sqrt(tf.exp(log_simga_sqs)) * eps - return (with_var + means) + self.s = tf.placeholder(tf.float32, [None, state_size], "state") + self.a = tf.placeholder(tf.int32, [None, 1], "action") + self.r = tf.placeholder(tf.float32, [None, 1], "discounted_rewards") - -class PGFFNN: - """Policy Gradient agent with only FFNN parameters - - Note: Minimal variance reduction, no natural gradient or trust-region optimization - """ - - def __init__(self, env, state_size, action_size, is_discrete, hparams): + with tf.variable_scope(name): + with tf.variable_scope('network'): + logits = feed_forward(self.s, ff_hparams) + # softmax layer to create probability array + self.outputs = tf.nn.softmax(logits) + + with tf.variable_scope('training'): + one_hot = tf.one_hot(self.a, action_size) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=one_hot, logits=logits) + self.loss = tf.reduce_mean(cross_entropy * self.r) + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss) + + def train(self, sample_s, sample_a, sample_r): """ - Builds the graph for Feed Forward NN Policy Gradient agent - - Args: - state_size: Integer size of state Tensor - action_size: Integer size of action Tensor - is_discrete: Boolean, True if discrete space, False if continuous - hparams: Dictionary of hyperparameters - 'learning_rate': Learning rate - 'output_size': Dimensionality of output - 'hidden_sizes': List of hidden layer sizes - 'activations': List of activation functions for each layer + Trains neural network + args: + sample_s: sample state vectors + sample_a: sample actions (integers) + sample_r: sample rewards (floats) Returns: - Output tensor of shape [None, output_size] - @Authors: Arsh Zahed + Error value for the sample batch """ - self.env = env - - self.input_ = tf.placeholder(tf.float32, [None, state_size], 
name="input_") - self.actions = tf.placeholder(tf.int32, [None, action_size], name="actions") - self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None,], - name="discounted_episode_rewards") - - self.mean_reward_ = tf.placeholder(tf.float32 , name="mean_reward") - pre_distr = architectures.FeedForward(self.input_, hparams, name='ffn_policygrad') - - if is_discrete: - self.action_distribution = tf.nn.softmax(pre_distr) - - with tf.name_scope("loss"): - neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.action_distribution, - labels = self.actions) - self.loss = tf.reduce_mean(neg_log_prob * self.discounted_episode_rewards_) - tf.summary.scalar('loss', self.loss) - else: - # TODO - pass - - with tf.name_scope("train"): - self.train_opt = tf.train.AdamOptimizer(hparams['learning_rate']).minimize(self.loss) - - - def train(self, sess, num_ep ): - for episode in range(num_ep): - - episode_rewards_sum = 0 - - # Launch the game - state = self.env.reset() - - self.env.render() - - while True: - # Choose action a, remember WE'RE NOT IN A DETERMINISTIC ENVIRONMENT, - # WE'RE OUTPUT PROBABILITIES. - action_probability_distribution = sess.run(self.action_distribution, - feed_dict={self.input_: state.reshape([1,4])}) - # select action w.r.t the actions prob - action = np.random.choice(range(action_probability_distribution.shape[1]), - p=action_probability_distribution.ravel()) - - # Perform a - new_state, reward, done, info = self.env.step(action) - # Store s, a, r - episode_states.append(state) - - # For actions because we output only one (the index) - # we need 2 (1 is for the action taken) - action_ = np.zeros(action_size) - action_[action] = 1 - - episode_actions.append(action_) - episode_rewards.append(reward) - - if done: - # Calculate sum reward - episode_rewards_sum = np.sum(episode_rewards) - allRewards.append(episode_rewards_sum) - total_rewards = np.sum(allRewards) - # Mean reward - mean_reward = np.divide(total_rewards, episode+1) - maximumRewardRecorded = np.amax(allRewards) - print("==========================================") - print("Episode: ", episode) - print("Reward: ", episode_rewards_sum) - print("Mean Reward", mean_reward) - print("Max reward so far: ", maximumRewardRecorded) - - # Calculate discounted reward - discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards) - - loss_, _ = sess.run( - [self.loss, self.train_opt], - feed_dict={self.input_: np.vstack(np.array(episode_states)), - self.actions: np.vstack(np.array(episode_actions)), - self.discounted_episode_rewards_: discounted_episode_rewards} - ) + feed_dict = {self.s: sample_s, self.a: sample_a, self.r: sample_r} + error, _ = self.sess.run([self.loss, self.train_op], feed_dict=feed_dict) + return error + def action_dist(self, state): + """ + Outputs action distribution based on state + args: + state: current state vector + Returns: + Vector of action distributions + """ + return self.sess.run(self.outputs, feed_dict={self.s: state}) class PGLSTM: @@ -153,30 +64,30 @@ class PGLSTM: def __init__(self): return - class PGConvNetwork: """ - A basic network that performs convolutions and + A basic network that performs convolutions. (Temporary!!) 
""" - def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): + + def __init__(self, state_size, action_size, learning_rate, name='PGConvNetwork'): self.state_size = state_size self.action_size = action_size self.learning_rate = learning_rate - + with tf.variable_scope(name): with tf.name_scope("inputs"): # We create the placeholders # *state_size means that we take each elements of state_size in tuple hence is like if we wrote # [None, 84, 84, 4] - self.inputs_= tf.placeholder(tf.float32, [None, *state_size], name="inputs_") + self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs_") self.actions = tf.placeholder(tf.int32, [None, action_size], name="actions") - self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], name="discounted_episode_rewards_") - - + self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], + name="discounted_episode_rewards_") + # Add this placeholder for having this variable in tensorboard self.mean_reward_ = tf.placeholder(tf.float32, name="mean_reward") - + with tf.name_scope("conv1"): """ First convnet: @@ -185,22 +96,22 @@ def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): ELU """ # Input is 84x84x4 - self.conv1 = tf.layers.conv2d(inputs = self.inputs_, - filters = 32, - kernel_size = [8,8], - strides = [4,4], - padding = "VALID", + self.conv1 = tf.layers.conv2d(inputs=self.inputs_, + filters=32, + kernel_size=[8, 8], + strides=[4, 4], + padding="VALID", kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), - name = "conv1") + name="conv1") self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1, - training = True, - epsilon = 1e-5, - name = 'batch_norm1') + training=True, + epsilon=1e-5, + name='batch_norm1') self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out") ## --> [20, 20, 32] - + with tf.name_scope("conv2"): """ Second convnet: @@ -208,22 +119,22 @@ def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): BatchNormalization ELU """ - self.conv2 = tf.layers.conv2d(inputs = self.conv1_out, - filters = 64, - kernel_size = [4,4], - strides = [2,2], - padding = "VALID", - kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), - name = "conv2") + self.conv2 = tf.layers.conv2d(inputs=self.conv1_out, + filters=64, + kernel_size=[4, 4], + strides=[2, 2], + padding="VALID", + kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), + name="conv2") self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2, - training = True, - epsilon = 1e-5, - name = 'batch_norm2') + training=True, + epsilon=1e-5, + name='batch_norm2') self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out") ## --> [9, 9, 64] - + with tf.name_scope("conv3"): """ Third convnet: @@ -231,50 +142,48 @@ def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): BatchNormalization ELU """ - self.conv3 = tf.layers.conv2d(inputs = self.conv2_out, - filters = 128, - kernel_size = [4,4], - strides = [2,2], - padding = "VALID", - kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), - name = "conv3") + self.conv3 = tf.layers.conv2d(inputs=self.conv2_out, + filters=128, + kernel_size=[4, 4], + strides=[2, 2], + padding="VALID", + kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), + name="conv3") self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3, - training = True, - epsilon = 1e-5, - name = 'batch_norm3') + training=True, + epsilon=1e-5, + name='batch_norm3') 
self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out") ## --> [3, 3, 128] - + with tf.name_scope("flatten"): self.flatten = tf.layers.flatten(self.conv3_out) ## --> [1152] - + with tf.name_scope("fc1"): - self.fc = tf.layers.dense(inputs = self.flatten, - units = 512, - activation = tf.nn.elu, - kernel_initializer=tf.contrib.layers.xavier_initializer(), - name="fc1") - + self.fc = tf.layers.dense(inputs=self.flatten, + units=512, + activation=tf.nn.elu, + kernel_initializer=tf.contrib.layers.xavier_initializer(), + name="fc1") + with tf.name_scope("logits"): - self.logits = tf.layers.dense(inputs = self.fc, - kernel_initializer=tf.contrib.layers.xavier_initializer(), - units = 3, - activation=None) - + self.logits = tf.layers.dense(inputs=self.fc, + kernel_initializer=tf.contrib.layers.xavier_initializer(), + units=3, + activation=None) + with tf.name_scope("softmax"): self.action_distribution = tf.nn.softmax(self.logits) - with tf.name_scope("loss"): # tf.nn.softmax_cross_entropy_with_logits computes the cross entropy of the result after applying the softmax function # If you have single-class labels, where an object can only belong to one class, you might now consider using # tf.nn.sparse_softmax_cross_entropy_with_logits so that you don't have to convert your labels to a dense one-hot array. - self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.actions) - self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_) - - + self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.actions) + self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_) + with tf.name_scope("train"): self.train_opt = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index aeedd32..e4d885b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ tensorflow >= 1.10 -gym == 0.7.4 \ No newline at end of file +numpy >= 1.15.0 +gym >= 0.9.6 \ No newline at end of file diff --git a/tests/pgffnetwork.py b/tests/pgffnetwork.py new file mode 100644 index 0000000..c6f5d9c --- /dev/null +++ b/tests/pgffnetwork.py @@ -0,0 +1,69 @@ +""" +Tests PGFFNetwork on an environment + +@Authors: Yi Liu +""" + +import gym +import numpy as np +import tensorflow as tf +from algorithms.policygrad import PGFFNetwork + +# maximum number of iterations of environment +n_max_iter = 1500 +# number of games played +n_games = 1500 +discount_rate = 0.99 + +env = gym.make('CartPole-v0') +env._max_episode_steps = n_max_iter +# environment observation size +env_obs_n = 4 +# environment action size +env_act_n = 2 + +ff_hparams = { + 'hidden_sizes': [30, 30], + 'activations': [tf.nn.relu, tf.nn.relu], + 'output_size': env_act_n +} +learning_rate = 0.001 +sess = tf.InteractiveSession() +agent = PGFFNetwork(sess, env_obs_n, env_act_n, ff_hparams, learning_rate) +tf.global_variables_initializer().run() + +for game in range(n_games): + obs = env.reset() + # store states, actions, and rewards + states = [] + actions = [] + rewards = [] + for _ in range(n_max_iter): + action_dist = agent.action_dist(obs[np.newaxis, :]) + action = np.random.choice(np.arange(env_act_n), p=np.squeeze(action_dist)) + obs, reward, done, info = env.step(action) + + states.append(obs) + actions.append(action) + rewards.append(reward) + if done: + break + + # discount rewards + discounted_rewards = [] + accumulated_reward = 0 + for step in 
reversed(range(len(rewards))): + accumulated_reward = rewards[step] + accumulated_reward * discount_rate + discounted_rewards.insert(0, accumulated_reward) + # normalize discounted rewards + rewards_mean = np.mean(discounted_rewards) + rewards_std = np.std(discounted_rewards) + discounted_rewards = [(reward - rewards_mean) / rewards_std for reward in discounted_rewards] + + # format actions and rewards to proper dimensions + actions = np.expand_dims(actions, axis=1) + discounted_rewards = np.expand_dims(discounted_rewards, axis=1) + + # train agent + error = agent.train(states, actions, discounted_rewards) + print("Game: {}, Error: {}, Game Length: {}, Total Reward: {}".format(game, error, len(actions), sum(rewards)))
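
Note, not part of the diff above: the reward-processing loop at the end of tests/pgffnetwork.py accumulates discounted returns from the end of the episode, R[t] = rewards[t] + discount_rate * R[t+1], and then normalizes them to zero mean and unit variance before training. A minimal standalone sketch of that recurrence, using a made-up reward list purely for illustration:

# Illustration only: the discounting recurrence from the test, applied to a toy reward list.
rewards = [1.0, 1.0, 1.0]
discount_rate = 0.99
discounted = []
accumulated = 0.0
for r in reversed(rewards):
    accumulated = r + accumulated * discount_rate
    discounted.insert(0, accumulated)
# discounted is approximately [2.9701, 1.99, 1.0]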
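
A further illustrative sketch, also not part of this PR: how the hparams dictionaries documented in algorithms/architectures.py are meant to be passed to the renamed builders. The input shapes and hyperparameter values below are assumptions chosen for the example (they are not used anywhere in the repository), and the imports assume the repository root is on PYTHONPATH, as the new test already does:

import tensorflow as tf
from algorithms.architectures import cnn, dynamic_rnn, feed_forward

# Image input for the convolutional builder; 84x84x4 is an assumed example shape.
images = tf.placeholder(tf.float32, [None, 84, 84, 4], name="images")
cnn_hparams = {
    'feature_maps': [32, 64],
    'kernel_sizes': [[8, 8], [4, 4]],
    'stride_lengths': [[4, 4], [2, 2]],
    'padding_types': ['VALID', 'VALID'],
    'activations': [tf.nn.relu, tf.nn.relu],
}
flat = cnn(images, cnn_hparams, name="example_cnn")  # -> [None, flattened_size]

# Sequence input for the recurrent builder; keys left out of the dictionary fall back
# to the defaults dynamic_rnn fills in (no attention, no dropout, BasicLSTMCell, tanh).
sequences = tf.placeholder(tf.float32, [None, 20, 10], name="sequences")
outputs, states = dynamic_rnn(sequences, {'rnn_layer_sizes': [64, 64]}, name="example_lstm")

# Feed-forward head on top of the flattened CNN features.
ff_hparams = {'hidden_sizes': [30], 'activations': [tf.nn.relu], 'output_size': 2}
logits = feed_forward(flat, ff_hparams, name="example_ffn")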