Merge pull request #19 from akmorrow13/master
Updated sampling technique for multiclass models
akmorrow13 authored Jun 24, 2020
2 parents e247c23 + 19615a3 commit 68c0f50
Showing 6 changed files with 210 additions and 96 deletions.
4 changes: 1 addition & 3 deletions epitome/functions.py
@@ -115,7 +115,7 @@ def get_y_indices_for_cell(matrix, cellmap, cell):
    return np.copy(matrix[cellmap[cell]])


-def get_y_indices_for_assay(arrays, assaymap, assay):
+def get_y_indices_for_assay(matrix, assaymap, assay):
    """
    Gets indices for an assay.
@@ -126,8 +126,6 @@ def get_y_indices_for_assay(arrays, assaymap, assay):
    :return locations of indices for the cell name specified
    """
    # get column for this assay
-    matrix = output = np.array(arrays)
    return np.copy(matrix[:,assaymap[assay]])

def get_missing_indices_for_cell(matrix, cellmap, cell):
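The rename makes the parameter name match what the function actually receives, and the deleted line no longer forces a redundant `np.array` copy. A minimal usage sketch of the updated function, with toy `matrix` and `assaymap` values (illustrative, not from the repository):

```python
import numpy as np

# toy index matrix: rows are cell types, cols are assays; -1 marks missing data
matrix = np.array([[0, 1, -1],
                   [2, 3,  4]])
assaymap = {'DNase': 0, 'CTCF': 1, 'Rad21': 2}

# one column of indices for an assay, across all cell types
ctcf_indices = np.copy(matrix[:, assaymap['CTCF']])
print(ctcf_indices)  # [1 3]
```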
94 changes: 59 additions & 35 deletions epitome/generators.py
@@ -6,7 +6,7 @@
import tensorflow as tf
from .constants import *
from .functions import *
-import epitome.iio as iio
+from .sampling import *
import glob

######################### Original Data Generator: Only peak based #####################
@@ -25,6 +25,7 @@ def load_data(data,
                mode = Dataset.TRAIN,
                similarity_matrix = None,
                indices = None,
+                return_feature_names = False,
                **kwargs):
    """
    Takes Deepsea data and calculates distance metrics from cell types whose locations
@@ -50,6 +50,21 @@

    # for now, we require DNase to be part of the similarity comparison
    assert('DNase' in similarity_assays)
+
+    # get indices for features. Rows are cells and cols are assays
+    cellmap_idx = [cellmap[c] for c in list(eval_cell_types)]
+    feature_cell_indices = matrix[cellmap_idx,:]
+
+    # indices to be deleted used for similarity comparison
+    delete_indices = np.array([assaymap[s] for s in similarity_assays])
+
+    # make sure no similarity comparison data is missing for all cell types
+    assert np.invert(np.any(feature_cell_indices[:,delete_indices] == -1)), \
+        "missing data at %s" % (np.where(feature_cell_indices[:,delete_indices] == -1)[0])
+
+    # names of labels that are being predicted
+    feature_assays = [a for a in list(assaymap)] # assays used as features for each evaluation cell type
+    label_assays = [a for a in feature_assays if a not in similarity_assays]

    if (not isinstance(mode, Dataset)):
        raise ValueError("mode is not a Dataset enum")
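The added block precomputes the feature index matrix once and fails fast when any evaluation cell type is missing data for a similarity assay. A small self-contained sketch of the check, with toy values (all names below are illustrative):

```python
import numpy as np

matrix = np.array([[0, 1, -1],   # rows: cell types
                   [2, 3,  4]])  # cols: assays (col 0 = DNase)
eval_cell_types = ['K562', 'HepG2']
cellmap = {'K562': 0, 'HepG2': 1}
assaymap = {'DNase': 0, 'CTCF': 1, 'Rad21': 2}
similarity_assays = ['DNase']

cellmap_idx = [cellmap[c] for c in eval_cell_types]
feature_cell_indices = matrix[cellmap_idx, :]
delete_indices = np.array([assaymap[s] for s in similarity_assays])

# fails if any evaluation cell type lacks data for a similarity assay
assert not np.any(feature_cell_indices[:, delete_indices] == -1)
```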
@@ -61,23 +77,18 @@
                                                   list(cellmap))))
        feature_indices = feature_indices[feature_indices != -1]

-        # need to re-proportion the indices to equalize positives
+        # need to re-proportion the indices to oversample underrepresented labels
        if (len(list(assaymap)) > 2):

-            # get sums for each feature in the dataset
-            rowsums = np.sum(data[feature_indices,:], axis=1)
-
-            # multiply data by row scaling factor
-            scale_factor = 1/rowsums
-            scaled = data[feature_indices,:] * scale_factor[:, np.newaxis]
-
-            # indices where sum > 0
-            indices_zero = np.where(np.sum(scaled, axis=0) > 0)[0]
-            # then filter indices by probabilities inversely proportional to frequency
-            indices = np.random.choice(indices_zero, int(indices_zero.shape[0] * 0.4), p=(np.sum(scaled, axis=0)/np.sum(scaled))[indices_zero])

+            # configure y: label matrix of ChIP for all assays from all cell lines in train
+            indices = np.concatenate([get_y_indices_for_assay(matrix, assaymap, assay) for assay in label_assays])
+            indices = indices[indices != -1]
+            y = data[indices, :].T
+            m = MLSMOTE(y)
+            indices = m.fit_resample()

        else:
            # single TF model
            # get indices for DNAse and chip for this mark
            feature_indices = np.concatenate(list(map(lambda c: get_y_indices_for_cell(matrix, cellmap, c),
                                                   list(cellmap))))

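The multiclass branch now delegates resampling to `MLSMOTE` from the new `epitome.sampling` module, whose implementation is not shown in this diff. Below is a minimal sketch of the interface the call site implies, with placeholder internals that oversample rows containing rare labels; this is an assumption about the contract (`fit_resample()` returns row indices), not the module's actual code:

```python
import numpy as np

class MLSMOTE:
    """Placeholder sketch of the sampler's call-site contract: take a
    binary label matrix y (samples x labels) and return row indices
    resampled toward underrepresented labels."""

    def __init__(self, y):
        self.y = y

    def fit_resample(self):
        # inverse-frequency weight per label, so rare labels count more
        label_counts = np.maximum(self.y.sum(axis=0), 1)
        # per-sample weight: sum of inverse frequencies of its labels
        weights = (self.y / label_counts).sum(axis=1) + 1e-8
        probs = weights / weights.sum()
        # draw a same-sized set of row indices, biased toward rare labels
        return np.random.choice(len(self.y), size=len(self.y), p=probs)
```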
@@ -88,21 +99,23 @@
            TF_indices = TF_indices[TF_indices != -1]
            feature_indices = feature_indices[feature_indices != -1]

-            # sites where TF is in at least 1 cell line
+            # sites where TF is bound in at least 2 cell lines
            positive_indices = np.where(np.sum(data[TF_indices,:], axis=0) > 1)[0]

            indices_probs = np.ones([data.shape[1]])
            indices_probs[positive_indices] = 0
            indices_probs = indices_probs/np.sum(indices_probs, keepdims=1)

            # randomly select 10x as many sites where the TF is not bound in any cell line
-            negative_indices = np.random.choice(np.arange(0,data.shape[1]), positive_indices.shape[0] * 10,p=indices_probs)
+            negative_indices = np.random.choice(np.arange(0,data.shape[1]),
+                                                positive_indices.shape[0] * 10,
+                                                p=indices_probs)
            indices = np.sort(np.concatenate([negative_indices, positive_indices]))


    else:
        indices = range(0, data.shape[-1]) # not training mode, set to all points


    if (mode == Dataset.RUNTIME):
        label_cell_types = ["PLACEHOLDER_CELL"]
        if similarity_matrix is None:
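In the single-TF branch above, negatives are drawn only from non-positive sites by zeroing their selection probability and renormalizing, at roughly ten negatives per positive. A condensed, runnable sketch with toy data (array names and sizes are illustrative):

```python
import numpy as np

n_sites = 100
binding = np.zeros((3, n_sites))   # 3 cell lines x genomic sites
binding[:, [5, 17, 42]] = 1        # toy sites bound in all 3 cell lines

# positives: sites bound in at least 2 cell lines
positive_indices = np.where(binding.sum(axis=0) > 1)[0]

# uniform probability over non-positive sites only
indices_probs = np.ones(n_sites)
indices_probs[positive_indices] = 0
indices_probs /= indices_probs.sum()

# ~10 negatives per positive, then merge and sort
negative_indices = np.random.choice(np.arange(n_sites),
                                    positive_indices.shape[0] * 10,
                                    p=indices_probs)
indices = np.sort(np.concatenate([negative_indices, positive_indices]))
```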
@@ -116,21 +129,11 @@
    # string of radii for meta data labeling
    radii_str = list(map(lambda x: "RADII_%i" % x, radii))

-    # get indices for features.rows are cells and cols are assays
-    cellmap_idx = [cellmap[c] for c in list(eval_cell_types)]
-    feature_cell_indices = matrix[cellmap_idx,:]
-
-    # indices to be deleted used for similarity comparison
-    delete_indices = np.array([assaymap[s] for s in similarity_assays])
-
-    # make sure no similarity comparison data is missing for all cell types
-    assert np.invert(np.any(feature_cell_indices[:,delete_indices] == -1)), \
-        "missing data at %s" % (np.where(feature_cell_indices[:,delete_indices] == -1)[0])

    def g():
        for i in indices: # for all records specified
+            feature_names = []

            for (cell) in label_cell_types: # for all cell types to be used in labels

                similarities_double_positive = np.empty([len(eval_cell_types),0])
                similarities_agreement = np.empty([len(eval_cell_types),0])

@@ -158,6 +161,9 @@
                # for cell types that are going to be features
                similarity_indices = feature_cell_indices[:, delete_indices]

+                similarity_labels_agreement = []
+                similarity_labels_dp = []
+
                for r, radius in enumerate(radii):

                    min_radius = max(0, i - radius + 1)
@@ -199,24 +205,33 @@
                    similarity_agreement = np.average(cell_train_data ==
                                                      cell_label_data, axis=-1)

+                    similarity_labels_agreement.append('r%i_%s' % (radius, 'agree'))
+                    similarity_labels_dp.append('r%i_%s' % (radius, 'dp'))
+
                    similarities_double_positive = np.concatenate([similarities_double_positive,similarity_double_positive],axis=1)
                    similarities_agreement = np.concatenate([similarities_agreement,similarity_agreement],axis=1)

                # reshape agreement assay similarity to radii by feature_cells
                similarities = np.concatenate([similarities_agreement, similarities_double_positive], axis=1)
+                similarity_labels = np.concatenate([similarity_labels_agreement, similarity_labels_dp])

                final = []
                for j,c in enumerate(eval_cell_types):
                    # get indices for this cell that has data
                    present_indices = feature_cell_indices[j,:]
                    present_indices = present_indices[present_indices!=-1]

                    cell_features = data[present_indices,i]
                    cell_similarities = similarities[j,:]
                    concat = np.concatenate([cell_features, cell_similarities])
-                    if c == cell: # if eval cell write out missing values
-                        final.append(np.zeros(len(concat)))
-                    else:
-                        final.append(concat)
+                    final.append(concat)

+                    # concatenate together feature names
+                    tmp = np.array(feature_assays)[feature_cell_indices[j,:] != -1]
+                    al = ['%s_%s' % (c, a) for a in tmp]
+                    sl = ['%s_%s' % (c, s) for s in similarity_labels]
+
+                    feature_names.append(np.concatenate([al, sl]))


                if (mode != Dataset.RUNTIME):
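For reference, the two per-radius similarity metrics reduce to simple vector operations: agreement is the mean of elementwise equality, while double-positive only credits shared peaks. A toy sketch; the double-positive formula here is an assumption based on its name, since its computation falls in a collapsed region of this diff:

```python
import numpy as np

cell_train_data = np.array([1, 0, 1, 1, 0])  # feature cell, one radius window
cell_label_data = np.array([1, 1, 1, 0, 0])  # label cell, same window

# fraction of positions where the two cells agree (0 or 1 alike)
similarity_agreement = np.average(cell_train_data == cell_label_data, axis=-1)       # 0.6
# assumed: fraction of positions where both cells have a peak
similarity_double_positive = np.average(cell_train_data * cell_label_data, axis=-1)  # 0.4
```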
@@ -228,12 +243,21 @@

                # append labels and assaymask
                final.append(labels.astype(np.float32))
+                feature_names.append(['lbl_%s_%s' % (cell, a) for a in label_assays]) # of form lbl_cellline_target

                final.append(assay_mask.astype(np.float32))
-                yield tuple(final)
+                feature_names.append(['mask_%s_%s' % (cell, a) for a in label_assays]) # of form mask_cellline_target

+                if (return_feature_names):
+                    yield (tuple(final), tuple(feature_names))
+                else:
+                    yield tuple(final)

    return g



def generator_to_tf_dataset(g, batch_size, shuffle_size, prefetch_size):
"""
Generates a tensorflow dataset from a data generator.
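With `return_feature_names=True`, the generator now yields a parallel tuple of human-readable column names alongside the features. A hypothetical usage sketch; the positional arguments of `load_data` are elided in this diff, so the call below is illustrative only:

```python
# hypothetical call: exact positional arguments are not shown in this diff
gen_fn = load_data(data, label_cell_types, eval_cell_types,
                   matrix, assaymap, cellmap,
                   mode=Dataset.TRAIN,
                   return_feature_names=True)

features, feature_names = next(gen_fn())
# feature_names contains entries like 'K562_CTCF' (assay features),
# 'K562_r1_agree' / 'K562_r1_dp' (similarity features), and
# 'lbl_K562_CTCF' / 'mask_K562_CTCF' (labels and assay mask)
```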
