Remove stds (#55)

* remove stds from return Co-authored-by: Alyssa Morrow <akmorrow@s122.Millennium.Berkeley.EDU>
YosefLab · Jan 28, 2021 · 4dc4856 · 4dc4856
1 parent a0f7cc6
commit 4dc4856
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 111 deletions.
diff --git a/epitome/functions.py b/epitome/functions.py
@@ -182,7 +182,6 @@ def pyranges2Vector(pr1, pr2):
     prs = [(pr2, pr1, False), (pr1, pr2, True)]
     pool = multiprocessing.Pool(processes=2)
     results = pool.map(pyranges_intersect, prs)
-
     pool.close()
     pool.join()
 

diff --git a/epitome/models.py b/epitome/models.py
@@ -14,7 +14,6 @@
 
 from epitome import *
 import tensorflow as tf
-import tensorflow_probability as tfp
 
 from .functions import *
 from .constants import *
@@ -37,7 +36,6 @@
 class VariationalPeakModel():
     """
     Model for learning from ChIP-seq peaks.
-    Modeled from `this Bayesian Neural Network <https://github.com/tensorflow/probability/blob/master/tensorflow_probability/examples/bayesian_neural_network.py>`_.
     """
 
     def __init__(self,
@@ -129,29 +127,6 @@ def __init__(self,
         self.debug = debug
         self.model = self.create_model()
 
-    def get_weight_parameters(self):
-        '''
-        Extracts weight posterior statistics for layers with weight distributions.
-
-        :param keras model model: keras model
-        :return: triple of layer names, weight means for each layer and stddev for each layer.
-        :rtype: tuple
-        '''
-
-        names = []
-        qmeans = []
-        qstds = []
-        for i, layer in enumerate(self.model.layers):
-            try:
-                q = layer.kernel_posterior
-            except AttributeError:
-                continue
-            names.append("Layer {}".format(i))
-            qmeans.append(q.mean())
-            qstds.append(q.stddev())
-
-        return (names, qmeans, qstds)
-
     def save(self, checkpoint_path):
         '''
         Saves model.
@@ -245,24 +220,20 @@ def train_step(f):
             with tf.GradientTape() as tape:
 
                 logits = self.model(features, training=True)
-                kl_loss = tf.reduce_sum(self.model.losses)
-                neg_log_likelihood = self.loss_fn(labels, logits, weights)
-                elbo_loss = neg_log_likelihood + kl_loss
+                loss = self.loss_fn(labels, logits, weights)
 
-            gradients = tape.gradient(elbo_loss, self.model.trainable_variables)
+            gradients = tape.gradient(loss, self.model.trainable_variables)
             self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
 
-            return elbo_loss, neg_log_likelihood, kl_loss
+            return loss
 
-        @tf.function
         def loopiter():
             for step, f in enumerate(self.train_iter):
                 loss = train_step(f)
 
-                if step % 100 == 0:
-                  tf.compat.v1.logging.info(str(step) + " " + str(tf.reduce_mean(loss[0])) +
-                                            str(tf.reduce_mean(loss[1])) +
-                                            str(tf.reduce_mean(loss[2])))
+                if step % 1000 == 0:
+                  tf.print("Step: ", step)
+                  tf.print("\tLoss: ", tf.reduce_mean(loss))
 
                   if (self.debug):
                       tf.compat.v1.logging.info("On validation")
@@ -308,7 +279,7 @@ def test_from_generator(self, num_samples, ds, calculate_metrics=True):
 
         return self.run_predictions(num_samples, ds, calculate_metrics)
 
-    def eval_vector(self, matrix, indices, samples = 50):
+    def eval_vector(self, matrix, indices):
         '''
         Evaluates a new cell type based on its chromatin (DNase or ATAC-seq) vector, as well
         as any other similarity targets (acetylation, methylation, etc.). len(vector) should equal
@@ -337,15 +308,15 @@ def eval_vector(self, matrix, indices, samples = 50):
 
         results = self.run_predictions(num_samples, ds, calculate_metrics = False)
 
-        return results['preds_mean'], results['preds_std']
+        return results['preds']
 
     @tf.function
     def predict_step_matrix(self, inputs):
         """
         Runs predictions on numpy matrix from _predict
 
         :param np.ndarray inputs: numpy matrix of examples x features
-        :return: mean of predictions
+        :return: predictions
         :rtype: np.ndarray
         """
 
@@ -357,7 +328,7 @@ def predict_step_matrix(self, inputs):
         split_inputs = tf.split(tf.dtypes.cast(inputs, tf.float32), cell_lens, axis=1)
 
         # predict
-        return predict_step_generator(split_inputs, samples=1)[0]
+        return self.predict_step_generator(split_inputs)
 
     def _predict(self, numpy_matrix):
         '''
@@ -372,33 +343,23 @@ def _predict(self, numpy_matrix):
         return self.predict_step_matrix(numpy_matrix)
 
     @tf.function
-    def predict_step_generator(self, inputs_b, samples = 50):
+    def predict_step_generator(self, inputs_b):
         '''
         Runs predictions on inputs from run_predictions
 
         :param tf.Tensor inputs_b: batch of input data
-        :param int samples: Number of samples to test. Defaults to 50.
-        :return: mean and standard deviations of predictions
+        :return: predictions
         :rtype: tuple
         '''
+        return tf.sigmoid(self.model(inputs_b))
 
-        # sample n times by tiling batch by rows, running
-        # predictions for each row
-        inputs_tiled = [tf.tile(i, (samples, 1)) for i in inputs_b]
-        tmp = self.model(inputs_tiled)
-        y_pred = tf.sigmoid(tmp)
-        # split up batches into a third dimension and stack them in third dimension
-        preds = tf.stack(tf.split(y_pred, samples, axis=0), axis=0)
-        return tf.math.reduce_mean(preds, axis=0), tf.math.reduce_std(preds, axis=0)
-
-    def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples = 50):
+    def run_predictions(self, num_samples, iter_, calculate_metrics = True):
         '''
         Runs predictions on num_samples records
 
         :param int num_samples: number of samples to test
         :param tf.dataset iter_: output of self.sess.run(generator_to_one_shot_iterator()), handle to one shot iterator of records
         :param bool calculate_metrics: whether to return auROC/auPR
-        :param int samples: number of times to sample from network
 
         :return: dict of preds, truth, target_dict, auROC, auPRC, False
             preds = predictions,
@@ -414,36 +375,28 @@ def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples
 
         # empty arrays for concatenation
         truth = []
-        preds_mean = []
-        preds_std = []
+        preds = []
         sample_weight = []
 
         for f in tqdm.tqdm(iter_.take(batches)):
             inputs_b = f[:-2]
             truth_b = f[-2]
             weights_b = f[-1]
-            # Calculate epistemic uncertainty for batch by iterating over a certain number of times,
-            # getting y_preds. You can then calculate the mean and sigma of the predictions,
-            # and use this to gather uncertainty: (see http://krasserm.github.io/2019/03/14/bayesian-neural-networks/)
-            # inputs, truth, sample_weight
-            preds_mean_b, preds_std_b = self.predict_step_generator(inputs_b, samples)
-
-            preds_mean.append(preds_mean_b)
-            preds_std.append(preds_std_b)
+            preds_b = self.predict_step_generator(inputs_b)
+
+            preds.append(preds_b)
             truth.append(truth_b)
             sample_weight.append(weights_b)
 
         # concat all results
-        preds_mean = tf.concat(preds_mean, axis=0)
-        preds_std = tf.concat(preds_std, axis=0)
+        preds = tf.concat(preds, axis=0)
 
         truth = tf.concat(truth, axis=0)
         sample_weight = tf.concat(sample_weight, axis=0)
 
         # trim off extra from last batch
         truth = truth[:num_samples, :].numpy()
-        preds_mean = preds_mean[:num_samples, :].numpy()
-        preds_std = preds_std[:num_samples, :].numpy()
+        preds = preds[:num_samples, :].numpy()
         sample_weight = sample_weight[:num_samples, :].numpy()
 
         # reset truth back to 0 to compute metrics
@@ -454,21 +407,20 @@ def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples
         # do not continue to calculate metrics. Just return predictions and true values
         if (not calculate_metrics):
             return {
-                'preds_mean': preds_mean,
-                'preds_std': preds_std,
+                'preds': preds,
                 'truth': truth,
                 'weights': sample_weight,
                 'target_dict': None,
                 'auROC': None,
                 'auPRC': None
             }
 
-        assert(preds_mean.shape == sample_weight.shape)
+        assert(preds.shape == sample_weight.shape)
 
         try:
 
             # try/accept for cases with only one class (throws ValueError)
-            target_dict = get_performance(self.dataset.targetmap, preds_mean, truth_reset, sample_weight, self.dataset.predict_targets)
+            target_dict = get_performance(self.dataset.targetmap, preds, truth_reset, sample_weight, self.dataset.predict_targets)
 
             # calculate averages
             auROC = np.nanmean(list(map(lambda x: x['AUC'],target_dict.values())))
@@ -482,8 +434,7 @@ def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples
             tf.compat.v1.logging.info("Failed to calculate metrics")
 
         return {
-            'preds_mean': preds_mean,
-            'preds_std': preds_std,
+            'preds': preds,
             'truth': truth,
             'weights': sample_weight,
             'target_dict': target_dict,
@@ -521,23 +472,20 @@ def score_whole_genome(self, similarity_peak_files,
 
         print("scoring %i regions" % idx.shape[0])
 
-        # tuple of means and stds
         predictions = self.eval_vector(peak_matrix, idx)
-        print("finished predictions...", predictions[0].shape)
-
-        # zip together means for each position in idx
+        print("finished predictions...", predictions.shape)
 
         # return matrix of region, TF information. trim off idx column
         npRegions = regions.df.sort_values(by='idx').values[:,:-1]
         # TODO turn into right types (all strings right now)
         # predictions[0] is means of size n regions by # ChIP-seq peaks predicted
-        means = np.concatenate([npRegions, predictions[0]], axis=1)
+        preds = np.concatenate([npRegions, predictions], axis=1)
 
         # can load back in using:
         # > loaded = np.load('file_prefix.npz')
-        # > loaded['means'], loaded['stds']
+        # > loaded['preds']
         # TODO: save the right types!  (currently all strings!)
-        np.savez_compressed(file_prefix, means = means,
+        np.savez_compressed(file_prefix, preds = preds,
                             names=np.array(['chr','start','end'] + self.dataset.predict_targets))
 
         print("columns for matrices are chr, start, end, %s" % ", ".join(self.dataset.predict_targets))
@@ -573,14 +521,14 @@ def score_matrix(self, accessilibility_peak_matrix, regions, all_data = None):
 
         # TODO 9/10/2020: should do something more efficiently than a for loop
         for sample_i in tqdm.tqdm(range(accessilibility_peak_matrix.shape[0])):
-            # tuple of means and stds
+
             peaks_i = np.zeros((len(self.dataset.regions)))
             peaks_i[idx] = accessilibility_peak_matrix[sample_i, joined['idx']]
 
-            means, _ = self.eval_vector(peaks_i, idx, samples = 1)
+            preds = self.eval_vector(peaks_i, idx)
 
-            # group means by joined['idx']
-            results.append(means)
+            # group preds by joined['idx']
+            results.append(preds)
 
         # stack all samples along 0th axis
         tmp = np.stack(results)
@@ -635,29 +583,24 @@ def score_peak_file(self, similarity_peak_files, regions_peak_file):
         if len(idx) == 0:
             raise ValueError("No positive peaks found in %s" % regions_peak_file)
 
-        # tuple of means and stds
-        means, stds = self.eval_vector(peak_matrix, idx)
-        print("finished predictions...", means.shape)
-
-        assert type(means) == type(stds), "Means and STDs variables not of the same type"
-        if not isinstance(means, np.ndarray):
-            means = means.numpy()
-            stds = stds.numpy()
+        preds = self.eval_vector(peak_matrix, idx)
+        print("finished predictions...", preds.shape)
 
+        if not isinstance(preds, np.ndarray):
+            preds = preds.numpy()
 
-        means_df =  pd.DataFrame(data=means, columns=self.dataset.predict_targets)
-        std_cols = list(map(lambda x: x + "_stds",self.dataset.predict_targets))
-        stds_df =  pd.DataFrame(data=stds, columns=std_cols)
+        preds_df =  pd.DataFrame(data=preds, columns=self.dataset.predict_targets)
 
         # read in regions file and filter by indices that were scored
         p = self.dataset.regions.df
         p['idx']=p.index # keep original bed region ordering using idx column
         p.columns = ['Chromosome', 'Start','End','idx']
         prediction_positions = p[p['idx'].isin(idx)] # select regions that were scored
+
         # reset index to match predictions shape
         prediction_positions = prediction_positions.reset_index()
         prediction_positions['idx'] = prediction_positions.index
-        prediction_positions = pd.concat([prediction_positions,means_df,stds_df],axis=1)
+        prediction_positions = pd.concat([prediction_positions, preds_df],axis=1)
         prediction_positions_pr = pr.PyRanges(prediction_positions, int64=True).sort()
 
         original_file = bed2Pyranges(regions_peak_file)
@@ -727,16 +670,12 @@ def create_model(self, **kwargs):
         # make a channel for each cell type
         cell_channels = []
 
-        # TODO resize by max iterations. 5000 is an estimate for data size
-        kl_divergence_function = (lambda q, p, _: tfp.distributions.kl_divergence(q, p) /
-                            tf.cast(self.batch_size * 5000, dtype=tf.float32))
         for i in range(len(self.num_inputs)):
             # make input layer for cell
             last = cell_inputs[i]
             for j in range(self.layers):
                 num_units = int(self.num_inputs[i]/(2 * (j+1)))
-                d = tfp.layers.DenseFlipout(num_units,
-                                                kernel_divergence_fn=kl_divergence_function,
+                d = tf.keras.layers.Dense(num_units,
                                                 activation = self.activation)(last)
                 last = d
             cell_channels.append(last)
@@ -748,8 +687,7 @@ def create_model(self, **kwargs):
         else:
             last = cell_channels[0]
 
-        outputs = tfp.layers.DenseFlipout(self.num_outputs,
-                                        kernel_divergence_fn=kl_divergence_function,
+        outputs =  tf.keras.layers.Dense(self.num_outputs,
                                         activity_regularizer=tf.keras.regularizers.l1_l2(self.l1, self.l2),
                                         name="output_layer")(last)
 

diff --git a/epitome/test/models_test.py b/epitome/test/models_test.py
@@ -50,14 +50,14 @@ def test_train_model(self):
 
 		# Make sure predictions are not random
 		# after first iterations
-		assert(results1['preds_mean'].shape[0] == self.validation_size)
-		assert(results2['preds_mean'][0] < results1['preds_mean'].shape[0])
+		assert(results1['preds'].shape[0] == self.validation_size)
+		assert(results2['preds'][0] < results1['preds'].shape[0])
 
 	def test_test_model(self):
 
 		# make sure can run in test mode
 		results = self.model.test(self.validation_size, mode=Dataset.TEST)
-		assert(results['preds_mean'].shape[0] == self.validation_size)
+		assert(results['preds'].shape[0] == self.validation_size)
 
 	def test_specify_assays(self):
 		# test for https://github.com/YosefLab/epitome/issues/23
@@ -101,15 +101,15 @@ def test_eval_vector(self):
 		# should be able to evaluate on a dnase vector
 		similarity_matrix = np.ones(self.model.dataset.get_data(Dataset.TRAIN).shape[1])[None,:]
 		results = self.model.eval_vector(similarity_matrix, np.arange(0,20))
-		assert(results[0].shape[0] == 20)
+		assert(results.shape[0] == 20)
 
 	def test_save_model(self):
 		# should save and re-load model
 		tmp_path = self.tmpFile()
 		self.model.save(tmp_path)
 		loaded_model = VLP(checkpoint=tmp_path)
 		results = loaded_model.test(self.validation_size)
-		assert(results['preds_mean'].shape[0] == self.validation_size)
+		assert(results['preds'].shape[0] == self.validation_size)
 
 	def test_score_matrix(self):
 
@@ -181,9 +181,9 @@ def test_score_whole_genome(self):
 		loaded = np.load(file_prefix_name + ".npz", allow_pickle=True)
 
 		file_prefix.close()
-		assert 'means' in loaded.keys() and 'names' in loaded.keys()
+		assert 'preds' in loaded.keys() and 'names' in loaded.keys()
 
-		preds = loaded['means']
+		preds = loaded['preds']
 		names = loaded['names']
 		assert preds.shape == (200,4)
 		assert names.shape[0] == 4 # chr, start, end, CTCF