diff --git a/README.md b/README.md index 8742d27..8965f42 100644 --- a/README.md +++ b/README.md @@ -58,3 +58,5 @@ If you are not a member of the MeTaL team, feel free to submit pull requests and ## Code Structure ### Algorithms: This contains any code pertaining to learning/neural network algorithms and architectures. +### Tests: +Contains code for testing neural network architectures diff --git a/algorithms/architectures.py b/algorithms/architectures.py index a6cccd4..f0181e1 100644 --- a/algorithms/architectures.py +++ b/algorithms/architectures.py @@ -9,162 +9,166 @@ import tensorflow as tf -def FeedForward(_input, hparams, name="ffn"): - """ - Builds a Feed Forward NN with linear output - - Args: - _input: Tensor of shape [None, input_size] - hparams: Dictionary of hyperparameters - 'output_size': Dimensionality of output - 'hidden_sizes': List of hidden layer sizes - 'activations': List of activation functions for each layer - Returns: - Output tensor of shape [None, output_size] - @Authors: Arsh Zahed - """ - - # We iteratively nest the layers - net = _input - hidden_sizes = hparams['hidden_sizes'] - activations = hparams['activations'] - with tf.variable_scope(name): - for i in range(len(activations)): - net = tf.layers.dense(net, hidden_sizes[i], activations[i]) - # Call our prediction/policy y_hat. - # Linear activation allows for logits - y_hat = tf.layers.dense(net, hparams['output_size']) - - return y_hat - - -def MakeRNNCell(rnn_layer_sizes, - dropout_keep_prob=1.0, - attn_length=0, - base_cell=tf.contrib.rnn.BasicLSTMCell, - residual_connections=False, - activation=tf.nn.tanh): - """ - Makes an RNN cell from the given hyperparameters. (From Magenta) - - Args: - rnn_layer_sizes: A list of integer sizes (in units) for each layer of - the RNN. - dropout_keep_prob: The float probability to keep the output of any - given sub-cell. - attn_length: The size of the attention vector. - base_cell: The base tf.contrib.rnn.RNNCell to use for sub-cells. - - Returns: - A tf.contrib.rnn.MultiRNNCell based on the given hyperparameters. - @Authors: Arsh Zahed - """ - cells = [] - for i in range(len(rnn_layer_sizes)): - cell = base_cell(rnn_layer_sizes[i], activation=activation) - if attn_length and not cells: - # Add attention wrapper to first layer. - cell = tf.contrib.rnn.AttentionCellWrapper( - cell, attn_length, state_is_tuple=True) - if residual_connections: - cell = tf.contrib.rnn.ResidualWrapper(cell) - if i == 0 or rnn_layer_sizes[i] != rnn_layer_sizes[i - 1]: - cell = tf.contrib.rnn.InputProjectionWrapper(cell, rnn_layer_sizes[i]) - cell = tf.contrib.rnn.DropoutWrapper( - cell, output_keep_prob=dropout_keep_prob) - cells.append(cell) - - cell = tf.contrib.rnn.MultiRNNCell(cells) - - return cell - - -def DynamicRNN(_input, hparams, initial_state=None, name="lstm"): - """ - Builds andand executes Dynamic RNN with specified activation - - Args: - _input: Tensor of shape [None, total_time, input_size] - hparams: Dictionary of hyperparameters - 'rnn_layer_sizes': List of RNN layer sizes - 'dropout_keep_prob': Probability of not dropping - 'attn_length': Integer length of attention - 'base_cell': RNN Cell class from tf.contrib.rnn.* - 'residual_connections': Boolean, True to have residuals - 'activation': Output activation of RNN - Returns: - Outputs and states Tensors. 
Output Tensor of shape - [None, total_time, rnn_layer_sizes[-1]] - State Tensor (tuples) match shapes specified in hyperparameters - @Authors: Arsh Zahed - """ - - # Set defaults if they dont exist in hparams - if 'dropout_keep_prob' not in hparams: - hparams['dropout_keep_prob'] = 1.0 - if 'attn_length' not in hparams: - hparams['attn_length'] = 0 - if 'base_cell' not in hparams: - hparams['base_cell'] = tf.contrib.rnn.BasicLSTMCell - if 'residual_connections' not in hparams: - hparams['residual_connections'] = False - if 'activation' not in hparams: - hparams['activation'] = tf.tanh - - # Build RNN Cell - with tf.variable_scope(name): - rnn_cell = MakeRNNCell(hparams['rnn_layer_sizes'], - hparams['dropout_keep_prob'], - hparams['attn_length'], - hparams['base_cell'], - hparams['residual_connections'], - hparams['activation']) - - outputs, states = tf.nn.dynamic_rnn(rnn_cell, _input, initial_state=initial_state, - dtype=_input.dtype) - - return outputs, states - - -def CNN(_input, hparams, name="cnn"): - """ - Builds a Convolutional Neural Network with a flattened output - - Args: - _input: Tensor of shape [None, image_height, image_width, channels] - hparams: Dictionary of hyperparameters - 'feature_maps': List of feature maps for each layer - 'kernel_sizes': List of kernel sizes for each layer - 'stride_lengths': List of strides for each layer - 'padding_types': List of padding for each layer - 'activations': List of activation functions for each layer - Returns: - Flattened output tensor of shape [None, output_size] - @Authors: Yi Liu - """ - - net = _input - feature_maps = hparams['feature_maps'] - kernel_sizes = hparams['kernel_sizes'] - stride_lengths = hparams['stride_lengths'] - padding_types = hparams['padding_types'] - activations = hparams['activations'] - - with tf.variable_scope(name): - for i in range(len(activations)): - net = tf.layers.conv2d( - inputs=net, - filters=feature_maps[i], - kernel_size=kernel_sizes[i], - strides=stride_lengths[i], - padding=padding_types[i], - activation=activations[i] - ) - # Flatten network - flat = tf.contrib.layers.flatten(net) - return flat - - -def RCNN(self, _input, hparams, name='rcnn'): - # TODO - raise NotImplementedError('RCNN not implemented') + +def feed_forward(_input, hparams, name="ffn"): + """ + Builds a Feed Forward NN with linear output + + Args: + _input: Tensor of shape [None, input_size] + hparams: Dictionary of hyperparameters + 'output_size': Dimensionality of output + 'hidden_sizes': List of hidden layer sizes + 'activations': List of activation functions for each layer + name: Variable scope name + Returns: + Output tensor of shape [None, output_size] + @Authors: Arsh Zahed + """ + + # Iteratively nest the layers + net = _input + hidden_sizes = hparams['hidden_sizes'] + activations = hparams['activations'] + with tf.variable_scope(name): + for i in range(len(activations)): + net = tf.layers.dense(net, hidden_sizes[i], activations[i]) + # Call our prediction/policy y_hat. + # Linear activation allows for logits + y_hat = tf.layers.dense(net, hparams['output_size']) + + return y_hat + + +def make_rnn_cell(rnn_layer_sizes, + dropout_keep_prob=1.0, + attn_length=0, + base_cell=tf.contrib.rnn.BasicLSTMCell, + residual_connections=False, + activation=tf.nn.tanh): + """ + Makes an RNN cell from the given hyperparameters. (From Magenta) + + Args: + rnn_layer_sizes: A list of integer sizes (in units) for each layer of + the RNN. + dropout_keep_prob: The float probability to keep the output of any + given sub-cell. 
+ attn_length: The size of the attention vector. + base_cell: The base tf.contrib.rnn.RNNCell to use for sub-cells. + Returns: + A tf.contrib.rnn.MultiRNNCell based on the given hyperparameters. + @Authors: Arsh Zahed + """ + cells = [] + for i in range(len(rnn_layer_sizes)): + cell = base_cell(rnn_layer_sizes[i], activation=activation) + if attn_length and not cells: + # Add attention wrapper to first layer. + cell = tf.contrib.rnn.AttentionCellWrapper( + cell, attn_length, state_is_tuple=True) + if residual_connections: + cell = tf.contrib.rnn.ResidualWrapper(cell) + if i == 0 or rnn_layer_sizes[i] != rnn_layer_sizes[i - 1]: + cell = tf.contrib.rnn.InputProjectionWrapper(cell, rnn_layer_sizes[i]) + cell = tf.contrib.rnn.DropoutWrapper( + cell, output_keep_prob=dropout_keep_prob) + cells.append(cell) + + cell = tf.contrib.rnn.MultiRNNCell(cells) + + return cell + + +def dynamic_rnn(_input, hparams, initial_state=None, name="lstm"): + """ + Builds and executes Dynamic RNN with specified activation + + Args: + _input: Tensor of shape [None, total_time, input_size] + hparams: Dictionary of hyperparameters + 'rnn_layer_sizes': List of RNN layer sizes + 'dropout_keep_prob': Probability of not dropping + 'attn_length': Integer length of attention + 'base_cell': RNN Cell class from tf.contrib.rnn.* + 'residual_connections': Boolean, True to have residuals + 'activation': Output activation of RNN + name: Variable scope name + Returns: + Outputs and states Tensors. Output Tensor of shape + [None, total_time, rnn_layer_sizes[-1]] + State Tensor (tuples) match shapes specified in hyperparameters + @Authors: Arsh Zahed + """ + + # Set defaults if they don't exist in hparams + if 'dropout_keep_prob' not in hparams: + hparams['dropout_keep_prob'] = 1.0 + if 'attn_length' not in hparams: + hparams['attn_length'] = 0 + if 'base_cell' not in hparams: + hparams['base_cell'] = tf.contrib.rnn.BasicLSTMCell + if 'residual_connections' not in hparams: + hparams['residual_connections'] = False + if 'activation' not in hparams: + hparams['activation'] = tf.tanh + + # Build RNN Cell + with tf.variable_scope(name): + rnn_cell = make_rnn_cell(hparams['rnn_layer_sizes'], + hparams['dropout_keep_prob'], + hparams['attn_length'], + hparams['base_cell'], + hparams['residual_connections'], + hparams['activation']) + + outputs, states = tf.nn.dynamic_rnn(rnn_cell, _input, initial_state=initial_state, + dtype=_input.dtype) + + return outputs, states + + +def cnn(_input, hparams, name="cnn"): + """ + Builds a Convolutional Neural Network with a flattened output + + Args: + _input: Tensor of shape [None, image_height, image_width, channels] + hparams: Dictionary of hyperparameters + 'feature_maps': List of feature maps for each layer + 'kernel_sizes': List of kernel sizes for each layer + 'stride_lengths': List of strides for each layer + 'padding_types': List of padding for each layer + 'activations': List of activation functions for each layer + name: Variable scope name + Returns: + Flattened output tensor of shape [None, output_size] + @Authors: Yi Liu + """ + + net = _input + feature_maps = hparams['feature_maps'] + kernel_sizes = hparams['kernel_sizes'] + stride_lengths = hparams['stride_lengths'] + padding_types = hparams['padding_types'] + activations = hparams['activations'] + + with tf.variable_scope(name): + for i in range(len(activations)): + net = tf.layers.conv2d( + inputs=net, + filters=feature_maps[i], + kernel_size=kernel_sizes[i], + strides=stride_lengths[i], + padding=padding_types[i], +
activation=activations[i] + ) + # Flatten network + flat = tf.contrib.layers.flatten(net) + return flat + + +def rcnn(_input, hparams, name='rcnn'): + # TODO + raise NotImplementedError('RCNN not implemented') + diff --git a/algorithms/policygrad.py b/algorithms/policygrad.py index 3a10513..3329435 100644 --- a/algorithms/policygrad.py +++ b/algorithms/policygrad.py @@ -1,148 +1,59 @@ +""" +Classes for policy gradient neural networks +@Authors: Yi Liu +""" import tensorflow as tf -from algorithms import architectures +from algorithms.architectures import feed_forward -def discount_and_normalize_rewards(episode_rewards): +class PGFFNetwork: """ - Discounts and normalizes rewards from an episode - - TODO: Review and finish - """ - - discounted_episode_rewards = np.zeros_like(episode_rewards) - cumulative = 0.0 - for i in reversed(range(len(episode_rewards))): - cumulative = cumulative * gamma + episode_rewards[i] - discounted_episode_rewards[i] = cumulative - - mean = np.mean(discounted_episode_rewards) - std = np.std(discounted_episode_rewards) - discounted_episode_rewards = (discounted_episode_rewards - mean) / (std) - - return discounted_episode_rewards - - -def SampleGaussian(means, log_simga_sqs): + Creates a policy gradient feed forward neural network + @Authors: Yi Liu """ - Differentiably samples from Guassian using reparameterization trick. + def __init__(self, sess, state_size, action_size, ff_hparams, lr, name='PGFFNetwork'): + self.lr = lr + self.sess = sess - Args: - means: Tensor of mean values - log_sigma_sqs: Tensor of the logarithms of the variances - Returs: - Tensor of sampled gussian - @Authors: Arsh Zahed - """ - unit = tf.random_norm(means.shape, 0, 1) - with_var = tf.sqrt(tf.exp(log_simga_sqs)) * eps - return (with_var + means) + self.s = tf.placeholder(tf.float32, [None, state_size], "state") + self.a = tf.placeholder(tf.int32, [None, 1], "action") + self.r = tf.placeholder(tf.float32, [None, 1], "discounted_rewards") - -class PGFFNN: - """Policy Gradient agent with only FFNN parameters - - Note: Minimal variance reduction, no natural gradient or trust-region optimization - """ - - def __init__(self, env, state_size, action_size, is_discrete, hparams): + with tf.variable_scope(name): + with tf.variable_scope('network'): + logits = feed_forward(self.s, ff_hparams) + # softmax layer to create probability array + self.outputs = tf.nn.softmax(logits) + + with tf.variable_scope('training'): + one_hot = tf.one_hot(self.a, action_size) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=one_hot, logits=logits) + self.loss = tf.reduce_mean(cross_entropy * self.r) + self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss) + + def train(self, sample_s, sample_a, sample_r): """ - Builds the graph for Feed Forward NN Policy Gradient agent - - Args: - state_size: Integer size of state Tensor - action_size: Integer size of action Tensor - is_discrete: Boolean, True if discrete space, False if continuous - hparams: Dictionary of hyperparameters - 'learning_rate': Learning rate - 'output_size': Dimensionality of output - 'hidden_sizes': List of hidden layer sizes - 'activations': List of activation functions for each layer + Trains neural network + args: + sample_s: sample state vectors + sample_a: sample actions (integers) + sample_r: sample rewards (floats) Returns: - Output tensor of shape [None, output_size] - @Authors: Arsh Zahed + Error value for the sample batch """ - self.env = env - - self.input_ = tf.placeholder(tf.float32, [None, state_size], 
name="input_") - self.actions = tf.placeholder(tf.int32, [None, action_size], name="actions") - self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None,], - name="discounted_episode_rewards") - - self.mean_reward_ = tf.placeholder(tf.float32 , name="mean_reward") - pre_distr = architectures.FeedForward(self.input_, hparams, name='ffn_policygrad') - - if is_discrete: - self.action_distribution = tf.nn.softmax(pre_distr) - - with tf.name_scope("loss"): - neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.action_distribution, - labels = self.actions) - self.loss = tf.reduce_mean(neg_log_prob * self.discounted_episode_rewards_) - tf.summary.scalar('loss', self.loss) - else: - # TODO - pass - - with tf.name_scope("train"): - self.train_opt = tf.train.AdamOptimizer(hparams['learning_rate']).minimize(self.loss) - - - def train(self, sess, num_ep ): - for episode in range(num_ep): - - episode_rewards_sum = 0 - - # Launch the game - state = self.env.reset() - - self.env.render() - - while True: - # Choose action a, remember WE'RE NOT IN A DETERMINISTIC ENVIRONMENT, - # WE'RE OUTPUT PROBABILITIES. - action_probability_distribution = sess.run(self.action_distribution, - feed_dict={self.input_: state.reshape([1,4])}) - # select action w.r.t the actions prob - action = np.random.choice(range(action_probability_distribution.shape[1]), - p=action_probability_distribution.ravel()) - - # Perform a - new_state, reward, done, info = self.env.step(action) - # Store s, a, r - episode_states.append(state) - - # For actions because we output only one (the index) - # we need 2 (1 is for the action taken) - action_ = np.zeros(action_size) - action_[action] = 1 - - episode_actions.append(action_) - episode_rewards.append(reward) - - if done: - # Calculate sum reward - episode_rewards_sum = np.sum(episode_rewards) - allRewards.append(episode_rewards_sum) - total_rewards = np.sum(allRewards) - # Mean reward - mean_reward = np.divide(total_rewards, episode+1) - maximumRewardRecorded = np.amax(allRewards) - print("==========================================") - print("Episode: ", episode) - print("Reward: ", episode_rewards_sum) - print("Mean Reward", mean_reward) - print("Max reward so far: ", maximumRewardRecorded) - - # Calculate discounted reward - discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards) - - loss_, _ = sess.run( - [self.loss, self.train_opt], - feed_dict={self.input_: np.vstack(np.array(episode_states)), - self.actions: np.vstack(np.array(episode_actions)), - self.discounted_episode_rewards_: discounted_episode_rewards} - ) + feed_dict = {self.s: sample_s, self.a: sample_a, self.r: sample_r} + error, _ = self.sess.run([self.loss, self.train_op], feed_dict=feed_dict) + return error + def action_dist(self, state): + """ + Outputs action distribution based on state + args: + state: current state vector + Returns: + Vector of action distributions + """ + return self.sess.run(self.outputs, feed_dict={self.s: state}) class PGLSTM: @@ -153,30 +64,30 @@ class PGLSTM: def __init__(self): return - class PGConvNetwork: """ - A basic network that performs convolutions and + A basic network that performs convolutions. (Temporary!!) 
""" - def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): + + def __init__(self, state_size, action_size, learning_rate, name='PGConvNetwork'): self.state_size = state_size self.action_size = action_size self.learning_rate = learning_rate - + with tf.variable_scope(name): with tf.name_scope("inputs"): # We create the placeholders # *state_size means that we take each elements of state_size in tuple hence is like if we wrote # [None, 84, 84, 4] - self.inputs_= tf.placeholder(tf.float32, [None, *state_size], name="inputs_") + self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs_") self.actions = tf.placeholder(tf.int32, [None, action_size], name="actions") - self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], name="discounted_episode_rewards_") - - + self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], + name="discounted_episode_rewards_") + # Add this placeholder for having this variable in tensorboard self.mean_reward_ = tf.placeholder(tf.float32, name="mean_reward") - + with tf.name_scope("conv1"): """ First convnet: @@ -185,22 +96,22 @@ def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): ELU """ # Input is 84x84x4 - self.conv1 = tf.layers.conv2d(inputs = self.inputs_, - filters = 32, - kernel_size = [8,8], - strides = [4,4], - padding = "VALID", + self.conv1 = tf.layers.conv2d(inputs=self.inputs_, + filters=32, + kernel_size=[8, 8], + strides=[4, 4], + padding="VALID", kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), - name = "conv1") + name="conv1") self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1, - training = True, - epsilon = 1e-5, - name = 'batch_norm1') + training=True, + epsilon=1e-5, + name='batch_norm1') self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out") ## --> [20, 20, 32] - + with tf.name_scope("conv2"): """ Second convnet: @@ -208,22 +119,22 @@ def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): BatchNormalization ELU """ - self.conv2 = tf.layers.conv2d(inputs = self.conv1_out, - filters = 64, - kernel_size = [4,4], - strides = [2,2], - padding = "VALID", - kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), - name = "conv2") + self.conv2 = tf.layers.conv2d(inputs=self.conv1_out, + filters=64, + kernel_size=[4, 4], + strides=[2, 2], + padding="VALID", + kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), + name="conv2") self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2, - training = True, - epsilon = 1e-5, - name = 'batch_norm2') + training=True, + epsilon=1e-5, + name='batch_norm2') self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out") ## --> [9, 9, 64] - + with tf.name_scope("conv3"): """ Third convnet: @@ -231,50 +142,48 @@ def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): BatchNormalization ELU """ - self.conv3 = tf.layers.conv2d(inputs = self.conv2_out, - filters = 128, - kernel_size = [4,4], - strides = [2,2], - padding = "VALID", - kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), - name = "conv3") + self.conv3 = tf.layers.conv2d(inputs=self.conv2_out, + filters=128, + kernel_size=[4, 4], + strides=[2, 2], + padding="VALID", + kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), + name="conv3") self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3, - training = True, - epsilon = 1e-5, - name = 'batch_norm3') + training=True, + epsilon=1e-5, + name='batch_norm3') 
self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out") ## --> [3, 3, 128] - + with tf.name_scope("flatten"): self.flatten = tf.layers.flatten(self.conv3_out) ## --> [1152] - + with tf.name_scope("fc1"): - self.fc = tf.layers.dense(inputs = self.flatten, - units = 512, - activation = tf.nn.elu, - kernel_initializer=tf.contrib.layers.xavier_initializer(), - name="fc1") - + self.fc = tf.layers.dense(inputs=self.flatten, + units=512, + activation=tf.nn.elu, + kernel_initializer=tf.contrib.layers.xavier_initializer(), + name="fc1") + with tf.name_scope("logits"): - self.logits = tf.layers.dense(inputs = self.fc, - kernel_initializer=tf.contrib.layers.xavier_initializer(), - units = 3, - activation=None) - + self.logits = tf.layers.dense(inputs=self.fc, + kernel_initializer=tf.contrib.layers.xavier_initializer(), + units=3, + activation=None) + with tf.name_scope("softmax"): self.action_distribution = tf.nn.softmax(self.logits) - with tf.name_scope("loss"): # tf.nn.softmax_cross_entropy_with_logits computes the cross entropy of the result after applying the softmax function # If you have single-class labels, where an object can only belong to one class, you might now consider using # tf.nn.sparse_softmax_cross_entropy_with_logits so that you don't have to convert your labels to a dense one-hot array. - self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.actions) - self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_) - - + self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.actions) + self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_) + with tf.name_scope("train"): self.train_opt = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index aeedd32..e4d885b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ tensorflow >= 1.10 -gym == 0.7.4 \ No newline at end of file +numpy >= 1.15.0 +gym >= 0.9.6 \ No newline at end of file diff --git a/tests/pgffnetwork.py b/tests/pgffnetwork.py new file mode 100644 index 0000000..c6f5d9c --- /dev/null +++ b/tests/pgffnetwork.py @@ -0,0 +1,69 @@ +""" +Tests PGFFNetwork on an environment + +@Authors: Yi Liu +""" + +import gym +import numpy as np +import tensorflow as tf +from algorithms.policygrad import PGFFNetwork + +# maximum number of iterations of environment +n_max_iter = 1500 +# number of games played +n_games = 1500 +discount_rate = 0.99 + +env = gym.make('CartPole-v0') +env._max_episode_steps = n_max_iter +# environment observation size +env_obs_n = 4 +# environment action size +env_act_n = 2 + +ff_hparams = { + 'hidden_sizes': [30, 30], + 'activations': [tf.nn.relu, tf.nn.relu], + 'output_size': env_act_n +} +learning_rate = 0.001 +sess = tf.InteractiveSession() +agent = PGFFNetwork(sess, env_obs_n, env_act_n, ff_hparams, learning_rate) +tf.global_variables_initializer().run() + +for game in range(n_games): + obs = env.reset() + # store states, actions, and rewards + states = [] + actions = [] + rewards = [] + for _ in range(n_max_iter): + action_dist = agent.action_dist(obs[np.newaxis, :]) + action = np.random.choice(np.arange(env_act_n), p=np.squeeze(action_dist)) + obs, reward, done, info = env.step(action) + + states.append(obs) + actions.append(action) + rewards.append(reward) + if done: + break + + # discount rewards + discounted_rewards = [] + accumulated_reward = 0 + for step in 
reversed(range(len(rewards))): + accumulated_reward = rewards[step] + accumulated_reward * discount_rate + discounted_rewards.insert(0, accumulated_reward) + # normalize discounted rewards + rewards_mean = np.mean(discounted_rewards) + rewards_std = np.std(discounted_rewards) + discounted_rewards = [(reward - rewards_mean) / rewards_std for reward in discounted_rewards] + + # format actions and rewards to proper dimensions + actions = np.expand_dims(actions, axis=1) + discounted_rewards = np.expand_dims(discounted_rewards, axis=1) + + # train agent + error = agent.train(states, actions, discounted_rewards) + print("Game: {}, Error: {}, Game Length: {}, Total Reward: {}".format(game, error, len(actions), sum(rewards)))
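
Note, not part of the diff above: the reward-processing loop at the end of tests/pgffnetwork.py accumulates discounted returns from the end of the episode, R[t] = rewards[t] + discount_rate * R[t+1], and then normalizes them to zero mean and unit variance before training. A minimal standalone sketch of that recurrence, using a made-up reward list purely for illustration:

# Illustration only: the discounting recurrence from the test, applied to a toy reward list.
rewards = [1.0, 1.0, 1.0]
discount_rate = 0.99
discounted = []
accumulated = 0.0
for r in reversed(rewards):
    accumulated = r + accumulated * discount_rate
    discounted.insert(0, accumulated)
# discounted is approximately [2.9701, 1.99, 1.0]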
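
A further illustrative sketch, also not part of this PR: how the hparams dictionaries documented in algorithms/architectures.py are meant to be passed to the renamed builders. The input shapes and hyperparameter values below are assumptions chosen for the example (they are not used anywhere in the repository), and the imports assume the repository root is on PYTHONPATH, as the new test already does:

import tensorflow as tf
from algorithms.architectures import cnn, dynamic_rnn, feed_forward

# Image input for the convolutional builder; 84x84x4 is an assumed example shape.
images = tf.placeholder(tf.float32, [None, 84, 84, 4], name="images")
cnn_hparams = {
    'feature_maps': [32, 64],
    'kernel_sizes': [[8, 8], [4, 4]],
    'stride_lengths': [[4, 4], [2, 2]],
    'padding_types': ['VALID', 'VALID'],
    'activations': [tf.nn.relu, tf.nn.relu],
}
flat = cnn(images, cnn_hparams, name="example_cnn")  # -> [None, flattened_size]

# Sequence input for the recurrent builder; keys left out of the dictionary fall back
# to the defaults dynamic_rnn fills in (no attention, no dropout, BasicLSTMCell, tanh).
sequences = tf.placeholder(tf.float32, [None, 20, 10], name="sequences")
outputs, states = dynamic_rnn(sequences, {'rnn_layer_sizes': [64, 64]}, name="example_lstm")

# Feed-forward head on top of the flattened CNN features.
ff_hparams = {'hidden_sizes': [30], 'activations': [tf.nn.relu], 'output_size': 2}
logits = feed_forward(flat, ff_hparams, name="example_ffn")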