Skip to content

Commit

Permalink
Remove stds (#55)
Browse files Browse the repository at this point in the history
* remove stds from return 

Co-authored-by: Alyssa Morrow <akmorrow@s122.Millennium.Berkeley.EDU>
  • Loading branch information
akmorrow13 and Alyssa Morrow authored Jan 28, 2021
1 parent a0f7cc6 commit 4dc4856
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 111 deletions.
1 change: 0 additions & 1 deletion epitome/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ def pyranges2Vector(pr1, pr2):
prs = [(pr2, pr1, False), (pr1, pr2, True)]
pool = multiprocessing.Pool(processes=2)
results = pool.map(pyranges_intersect, prs)

pool.close()
pool.join()

Expand Down
144 changes: 41 additions & 103 deletions epitome/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

from epitome import *
import tensorflow as tf
import tensorflow_probability as tfp

from .functions import *
from .constants import *
Expand All @@ -37,7 +36,6 @@
class VariationalPeakModel():
"""
Model for learning from ChIP-seq peaks.
Modeled from `this Bayesian Neural Network <https://github.com/tensorflow/probability/blob/master/tensorflow_probability/examples/bayesian_neural_network.py>`_.
"""

def __init__(self,
Expand Down Expand Up @@ -129,29 +127,6 @@ def __init__(self,
self.debug = debug
self.model = self.create_model()

def get_weight_parameters(self):
'''
Extracts weight posterior statistics for layers with weight distributions.
:param keras model model: keras model
:return: triple of layer names, weight means for each layer and stddev for each layer.
:rtype: tuple
'''

names = []
qmeans = []
qstds = []
for i, layer in enumerate(self.model.layers):
try:
q = layer.kernel_posterior
except AttributeError:
continue
names.append("Layer {}".format(i))
qmeans.append(q.mean())
qstds.append(q.stddev())

return (names, qmeans, qstds)

def save(self, checkpoint_path):
'''
Saves model.
Expand Down Expand Up @@ -245,24 +220,20 @@ def train_step(f):
with tf.GradientTape() as tape:

logits = self.model(features, training=True)
kl_loss = tf.reduce_sum(self.model.losses)
neg_log_likelihood = self.loss_fn(labels, logits, weights)
elbo_loss = neg_log_likelihood + kl_loss
loss = self.loss_fn(labels, logits, weights)

gradients = tape.gradient(elbo_loss, self.model.trainable_variables)
gradients = tape.gradient(loss, self.model.trainable_variables)
self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

return elbo_loss, neg_log_likelihood, kl_loss
return loss

@tf.function
def loopiter():
for step, f in enumerate(self.train_iter):
loss = train_step(f)

if step % 100 == 0:
tf.compat.v1.logging.info(str(step) + " " + str(tf.reduce_mean(loss[0])) +
str(tf.reduce_mean(loss[1])) +
str(tf.reduce_mean(loss[2])))
if step % 1000 == 0:
tf.print("Step: ", step)
tf.print("\tLoss: ", tf.reduce_mean(loss))

if (self.debug):
tf.compat.v1.logging.info("On validation")
Expand Down Expand Up @@ -308,7 +279,7 @@ def test_from_generator(self, num_samples, ds, calculate_metrics=True):

return self.run_predictions(num_samples, ds, calculate_metrics)

def eval_vector(self, matrix, indices, samples = 50):
def eval_vector(self, matrix, indices):
'''
Evaluates a new cell type based on its chromatin (DNase or ATAC-seq) vector, as well
as any other similarity targets (acetylation, methylation, etc.). len(vector) should equal
Expand Down Expand Up @@ -337,15 +308,15 @@ def eval_vector(self, matrix, indices, samples = 50):

results = self.run_predictions(num_samples, ds, calculate_metrics = False)

return results['preds_mean'], results['preds_std']
return results['preds']

@tf.function
def predict_step_matrix(self, inputs):
"""
Runs predictions on numpy matrix from _predict
:param np.ndarray inputs: numpy matrix of examples x features
:return: mean of predictions
:return: predictions
:rtype: np.ndarray
"""

Expand All @@ -357,7 +328,7 @@ def predict_step_matrix(self, inputs):
split_inputs = tf.split(tf.dtypes.cast(inputs, tf.float32), cell_lens, axis=1)

# predict
return predict_step_generator(split_inputs, samples=1)[0]
return self.predict_step_generator(split_inputs)

def _predict(self, numpy_matrix):
'''
Expand All @@ -372,33 +343,23 @@ def _predict(self, numpy_matrix):
return self.predict_step_matrix(numpy_matrix)

@tf.function
def predict_step_generator(self, inputs_b, samples = 50):
def predict_step_generator(self, inputs_b):
'''
Runs predictions on inputs from run_predictions
:param tf.Tensor inputs_b: batch of input data
:param int samples: Number of samples to test. Defaults to 50.
:return: mean and standard deviations of predictions
:return: predictions
:rtype: tuple
'''
return tf.sigmoid(self.model(inputs_b))

# sample n times by tiling batch by rows, running
# predictions for each row
inputs_tiled = [tf.tile(i, (samples, 1)) for i in inputs_b]
tmp = self.model(inputs_tiled)
y_pred = tf.sigmoid(tmp)
# split up batches into a third dimension and stack them in third dimension
preds = tf.stack(tf.split(y_pred, samples, axis=0), axis=0)
return tf.math.reduce_mean(preds, axis=0), tf.math.reduce_std(preds, axis=0)

def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples = 50):
def run_predictions(self, num_samples, iter_, calculate_metrics = True):
'''
Runs predictions on num_samples records
:param int num_samples: number of samples to test
:param tf.dataset iter_: output of self.sess.run(generator_to_one_shot_iterator()), handle to one shot iterator of records
:param bool calculate_metrics: whether to return auROC/auPR
:param int samples: number of times to sample from network
:return: dict of preds, truth, target_dict, auROC, auPRC, False
preds = predictions,
Expand All @@ -414,36 +375,28 @@ def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples

# empty arrays for concatenation
truth = []
preds_mean = []
preds_std = []
preds = []
sample_weight = []

for f in tqdm.tqdm(iter_.take(batches)):
inputs_b = f[:-2]
truth_b = f[-2]
weights_b = f[-1]
# Calculate epistemic uncertainty for batch by iterating over a certain number of times,
# getting y_preds. You can then calculate the mean and sigma of the predictions,
# and use this to gather uncertainty: (see http://krasserm.github.io/2019/03/14/bayesian-neural-networks/)
# inputs, truth, sample_weight
preds_mean_b, preds_std_b = self.predict_step_generator(inputs_b, samples)

preds_mean.append(preds_mean_b)
preds_std.append(preds_std_b)
preds_b = self.predict_step_generator(inputs_b)

preds.append(preds_b)
truth.append(truth_b)
sample_weight.append(weights_b)

# concat all results
preds_mean = tf.concat(preds_mean, axis=0)
preds_std = tf.concat(preds_std, axis=0)
preds = tf.concat(preds, axis=0)

truth = tf.concat(truth, axis=0)
sample_weight = tf.concat(sample_weight, axis=0)

# trim off extra from last batch
truth = truth[:num_samples, :].numpy()
preds_mean = preds_mean[:num_samples, :].numpy()
preds_std = preds_std[:num_samples, :].numpy()
preds = preds[:num_samples, :].numpy()
sample_weight = sample_weight[:num_samples, :].numpy()

# reset truth back to 0 to compute metrics
Expand All @@ -454,21 +407,20 @@ def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples
# do not continue to calculate metrics. Just return predictions and true values
if (not calculate_metrics):
return {
'preds_mean': preds_mean,
'preds_std': preds_std,
'preds': preds,
'truth': truth,
'weights': sample_weight,
'target_dict': None,
'auROC': None,
'auPRC': None
}

assert(preds_mean.shape == sample_weight.shape)
assert(preds.shape == sample_weight.shape)

try:

# try/accept for cases with only one class (throws ValueError)
target_dict = get_performance(self.dataset.targetmap, preds_mean, truth_reset, sample_weight, self.dataset.predict_targets)
target_dict = get_performance(self.dataset.targetmap, preds, truth_reset, sample_weight, self.dataset.predict_targets)

# calculate averages
auROC = np.nanmean(list(map(lambda x: x['AUC'],target_dict.values())))
Expand All @@ -482,8 +434,7 @@ def run_predictions(self, num_samples, iter_, calculate_metrics = True, samples
tf.compat.v1.logging.info("Failed to calculate metrics")

return {
'preds_mean': preds_mean,
'preds_std': preds_std,
'preds': preds,
'truth': truth,
'weights': sample_weight,
'target_dict': target_dict,
Expand Down Expand Up @@ -521,23 +472,20 @@ def score_whole_genome(self, similarity_peak_files,

print("scoring %i regions" % idx.shape[0])

# tuple of means and stds
predictions = self.eval_vector(peak_matrix, idx)
print("finished predictions...", predictions[0].shape)

# zip together means for each position in idx
print("finished predictions...", predictions.shape)

# return matrix of region, TF information. trim off idx column
npRegions = regions.df.sort_values(by='idx').values[:,:-1]
# TODO turn into right types (all strings right now)
# predictions[0] is means of size n regions by # ChIP-seq peaks predicted
means = np.concatenate([npRegions, predictions[0]], axis=1)
preds = np.concatenate([npRegions, predictions], axis=1)

# can load back in using:
# > loaded = np.load('file_prefix.npz')
# > loaded['means'], loaded['stds']
# > loaded['preds']
# TODO: save the right types! (currently all strings!)
np.savez_compressed(file_prefix, means = means,
np.savez_compressed(file_prefix, preds = preds,
names=np.array(['chr','start','end'] + self.dataset.predict_targets))

print("columns for matrices are chr, start, end, %s" % ", ".join(self.dataset.predict_targets))
Expand Down Expand Up @@ -573,14 +521,14 @@ def score_matrix(self, accessilibility_peak_matrix, regions, all_data = None):

# TODO 9/10/2020: should do something more efficiently than a for loop
for sample_i in tqdm.tqdm(range(accessilibility_peak_matrix.shape[0])):
# tuple of means and stds

peaks_i = np.zeros((len(self.dataset.regions)))
peaks_i[idx] = accessilibility_peak_matrix[sample_i, joined['idx']]

means, _ = self.eval_vector(peaks_i, idx, samples = 1)
preds = self.eval_vector(peaks_i, idx)

# group means by joined['idx']
results.append(means)
# group preds by joined['idx']
results.append(preds)

# stack all samples along 0th axis
tmp = np.stack(results)
Expand Down Expand Up @@ -635,29 +583,24 @@ def score_peak_file(self, similarity_peak_files, regions_peak_file):
if len(idx) == 0:
raise ValueError("No positive peaks found in %s" % regions_peak_file)

# tuple of means and stds
means, stds = self.eval_vector(peak_matrix, idx)
print("finished predictions...", means.shape)

assert type(means) == type(stds), "Means and STDs variables not of the same type"
if not isinstance(means, np.ndarray):
means = means.numpy()
stds = stds.numpy()
preds = self.eval_vector(peak_matrix, idx)
print("finished predictions...", preds.shape)

if not isinstance(preds, np.ndarray):
preds = preds.numpy()

means_df = pd.DataFrame(data=means, columns=self.dataset.predict_targets)
std_cols = list(map(lambda x: x + "_stds",self.dataset.predict_targets))
stds_df = pd.DataFrame(data=stds, columns=std_cols)
preds_df = pd.DataFrame(data=preds, columns=self.dataset.predict_targets)

# read in regions file and filter by indices that were scored
p = self.dataset.regions.df
p['idx']=p.index # keep original bed region ordering using idx column
p.columns = ['Chromosome', 'Start','End','idx']
prediction_positions = p[p['idx'].isin(idx)] # select regions that were scored

# reset index to match predictions shape
prediction_positions = prediction_positions.reset_index()
prediction_positions['idx'] = prediction_positions.index
prediction_positions = pd.concat([prediction_positions,means_df,stds_df],axis=1)
prediction_positions = pd.concat([prediction_positions, preds_df],axis=1)
prediction_positions_pr = pr.PyRanges(prediction_positions, int64=True).sort()

original_file = bed2Pyranges(regions_peak_file)
Expand Down Expand Up @@ -727,16 +670,12 @@ def create_model(self, **kwargs):
# make a channel for each cell type
cell_channels = []

# TODO resize by max iterations. 5000 is an estimate for data size
kl_divergence_function = (lambda q, p, _: tfp.distributions.kl_divergence(q, p) /
tf.cast(self.batch_size * 5000, dtype=tf.float32))
for i in range(len(self.num_inputs)):
# make input layer for cell
last = cell_inputs[i]
for j in range(self.layers):
num_units = int(self.num_inputs[i]/(2 * (j+1)))
d = tfp.layers.DenseFlipout(num_units,
kernel_divergence_fn=kl_divergence_function,
d = tf.keras.layers.Dense(num_units,
activation = self.activation)(last)
last = d
cell_channels.append(last)
Expand All @@ -748,8 +687,7 @@ def create_model(self, **kwargs):
else:
last = cell_channels[0]

outputs = tfp.layers.DenseFlipout(self.num_outputs,
kernel_divergence_fn=kl_divergence_function,
outputs = tf.keras.layers.Dense(self.num_outputs,
activity_regularizer=tf.keras.regularizers.l1_l2(self.l1, self.l2),
name="output_layer")(last)

Expand Down
14 changes: 7 additions & 7 deletions epitome/test/models_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ def test_train_model(self):

# Make sure predictions are not random
# after first iterations
assert(results1['preds_mean'].shape[0] == self.validation_size)
assert(results2['preds_mean'][0] < results1['preds_mean'].shape[0])
assert(results1['preds'].shape[0] == self.validation_size)
assert(results2['preds'][0] < results1['preds'].shape[0])

def test_test_model(self):

# make sure can run in test mode
results = self.model.test(self.validation_size, mode=Dataset.TEST)
assert(results['preds_mean'].shape[0] == self.validation_size)
assert(results['preds'].shape[0] == self.validation_size)

def test_specify_assays(self):
# test for https://github.com/YosefLab/epitome/issues/23
Expand Down Expand Up @@ -101,15 +101,15 @@ def test_eval_vector(self):
# should be able to evaluate on a dnase vector
similarity_matrix = np.ones(self.model.dataset.get_data(Dataset.TRAIN).shape[1])[None,:]
results = self.model.eval_vector(similarity_matrix, np.arange(0,20))
assert(results[0].shape[0] == 20)
assert(results.shape[0] == 20)

def test_save_model(self):
# should save and re-load model
tmp_path = self.tmpFile()
self.model.save(tmp_path)
loaded_model = VLP(checkpoint=tmp_path)
results = loaded_model.test(self.validation_size)
assert(results['preds_mean'].shape[0] == self.validation_size)
assert(results['preds'].shape[0] == self.validation_size)

def test_score_matrix(self):

Expand Down Expand Up @@ -181,9 +181,9 @@ def test_score_whole_genome(self):
loaded = np.load(file_prefix_name + ".npz", allow_pickle=True)

file_prefix.close()
assert 'means' in loaded.keys() and 'names' in loaded.keys()
assert 'preds' in loaded.keys() and 'names' in loaded.keys()

preds = loaded['means']
preds = loaded['preds']
names = loaded['names']
assert preds.shape == (200,4)
assert names.shape[0] == 4 # chr, start, end, CTCF
Expand Down

0 comments on commit 4dc4856

Please sign in to comment.