diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..284fb46
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+include README.md
+include lisa/config.ini
+include docs/*
\ No newline at end of file
diff --git a/bin/lisa b/bin/lisa
new file mode 100644
index 0000000..4d51141
--- /dev/null
+++ b/bin/lisa
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+
+from lisa import LISA, Log, _config, __version__
+import configparser
+import argparse
+import json
+import os
+import sys
+
+#____COMMAND LINE INTERFACE________
+
+INSTANTIATION_KWARGS = ['cores','isd_method','verbose','oneshot']
+PREDICTION_KWARGS = ['background_list','num_background_genes','background_strategy', 'seed']
+
+def extract_kwargs(args, keywords):
+    return {key : vars(args)[key] for key in keywords}
+
+def is_valid_prefix(prefix):
+
+    if os.path.isdir(prefix) or os.path.isfile(prefix) or os.path.isdir(os.path.dirname(prefix)):
+        return prefix
+
+    raise argparse.ArgumentTypeError('{}: Invalid file prefix.'.format(prefix))
+
+def lisa_download(args):
+    lisa = LISA(args.species)
+    lisa._download_data()
+
+
+def lisa_oneshot(args):
+
+    results, metadata = LISA(args.species, **extract_kwargs(args, INSTANTIATION_KWARGS)).predict(args.query_list.readlines(), **extract_kwargs(args, PREDICTION_KWARGS))
+
+    if args.save_metadata:
+        if args.output_prefix:
+            metadata_filename = args.output_prefix + '.metadata.json'
+        else:
+            metadata_filename = os.path.basename(args.query_list.name) + '.metadata.json'
+
+        with open(metadata_filename, 'w') as f:
+            f.write(json.dumps(metadata, indent=4))
+
+    if args.output_prefix is not None:
+        with open(args.output_prefix + '.lisa.tsv', 'w') as f:
+            f.write(results.to_tsv())
+    else:
+        print(results.to_tsv())
+
+
+def save_and_get_top_TFs(args, query_name, results, metadata):
+
+    with open(args.output_prefix + query_name + '.lisa.tsv', 'w') as f:
+        f.write(results.to_tsv())
+
+    if args.save_metadata:
+        with open(args.output_prefix + query_name + '.metadata.json', 'w') as f:
+            f.write(json.dumps(metadata, indent=4))
+
+    top_TFs = results.filter_rows(lambda x : x <= 0.05, 'combined_p_value_adjusted').todict()['factor']
+
+    top_TFs_unique, encountered = [], set()
+    for TF in top_TFs:
+        if TF not in encountered:
+            top_TFs_unique.append(TF)
+            encountered.add(TF)
+
+    return top_TFs_unique
+
+
+def print_results_multi(results_summary):
+    print('Sample\tTop Regulatory Factors (p < 0.05)')
+    for result_line in results_summary:
+        print(result_line[0], ', '.join(result_line[1]), sep = '\t')
+
+
+def lisa_multi(args):
+
+    log = Log(target = sys.stderr, verbose = args.verbose)
+    lisa = LISA(args.species, **extract_kwargs(args, INSTANTIATION_KWARGS), log = log)
+
+    query_dict = {os.path.basename(query.name) : query.readlines() for query in args.query_lists}
+
+    results_summary = []
+    for query_name, query_list in query_dict.items():
+
+        with log.section('Modeling {}:'.format(str(query_name))):
+            try:
+                results, metadata = lisa.predict(query_list, **extract_kwargs(args, PREDICTION_KWARGS))
+
+                top_TFs_unique = save_and_get_top_TFs(args, query_name, results, metadata)
+
+                results_summary.append((query_name, top_TFs_unique))
+
+            except AssertionError as err:
+                log.append(str(err))
+
+    print_results_multi(results_summary)
+
+
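+#In one-vs-rest mode, each gene list is modeled against the pooled genes of every other list.
+#Illustrative sketch with three cluster lists A, B, C:
+#   query = A  ->  background = B + C
+#   query = B  ->  background = A + C
+#   query = C  ->  background = A + B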
+def lisa_one_v_rest(args):
+
+    log = Log(target = sys.stderr, verbose = args.verbose)
+    lisa = LISA(args.species, **extract_kwargs(args, INSTANTIATION_KWARGS), log = log)
+
+    queries = {query.name : query.readlines() for query in args.query_lists}
+
+    cluster_lists = {
+        query_name : (query_genes, [
+            background_gene
+            for j, background in enumerate(queries.values()) for background_gene in background if not j == i
+        ])
+        for i, (query_name, query_genes) in enumerate(queries.items())
+    }
+
+    results_summary = []
+    for query_name, genelists in cluster_lists.items():
+
+        with log.section('Modeling {}:'.format(str(query_name))):
+            try:
+                results, metadata = lisa.predict(genelists[0], background_list = genelists[1], **extract_kwargs(args, PREDICTION_KWARGS))
+
+                top_TFs_unique = save_and_get_top_TFs(args, query_name, results, metadata)
+
+                results_summary.append((query_name, top_TFs_unique))
+
+            except AssertionError as err:
+                log.append(str(err))
+
+    print_results_multi(results_summary)
+
+
+def build_common_args(parser):
+    parser.add_argument('species', choices = ['hg38','mm10'], help = 'Find TFs associated with human (hg38) or mouse (mm10) genes')
+    parser.add_argument('-c','--cores', type = int, default = 1, help = 'Number of cores to use.')
+    parser.add_argument('--seed', type = int, default = None, help = 'Random seed for gene selection. Allows for reproducing exact results.')
+    parser.add_argument('--use_motifs', action = 'store_const', const = 'motifs', default='chipseq',
+        dest = 'isd_method', help = 'Use motif hits instead of ChIP-seq peaks to represent TF binding (only recommended if TF-of-interest is not represented in ChIP-seq database).')
+    parser.add_argument('--save_metadata', action = 'store_true', default = False, help = 'Save json-formatted metadata from processing each gene list.')
+
+
+def build_multiple_lists_args(parser):
+    parser.add_argument('query_lists', type = argparse.FileType('r', encoding = 'utf-8'), nargs = "+", help = 'user-supplied gene lists. One gene per line in either symbol or refseqID format')
+    parser.add_argument('-o','--output_prefix', required = True, type = is_valid_prefix, help = 'Output file prefix.')
+    parser.add_argument('-v','--verbose',type = int, default = 2)
+
+if __name__ == "__main__":
+
+    #define command-line arguments
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description =
+"""
+___________________________________________________________________________________________________________________________
+
+Lisa: inferring transcriptional regulators through integrative modeling of public chromatin accessibility and ChIP-seq data
+https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-1934-6
+X. Shirley Liu Lab, 2020
+___________________________________________________________________________________________________________________________
+""")
+
+    parser.add_argument('--version', action = 'version', version = __version__)
+
+    subparsers = parser.add_subparsers(help = 'commands')
+
+    #__ LISA oneshot command __#################
+
+    oneshot_parser = subparsers.add_parser('oneshot', help = 'Use LISA to infer TFs from one gene list. If you have multiple lists, this option will be slower than using "multi" due to data-loading time.\n')
+    build_common_args(oneshot_parser)
+
+    oneshot_parser.add_argument('query_list', type = argparse.FileType('r', encoding = 'utf-8'), help = 'user-supplied gene list. One gene per line in either symbol or refseqID format')
+    oneshot_parser.add_argument('-o','--output_prefix', required = False, type = is_valid_prefix, help = 'Output file prefix. If left empty, will write results to stdout.')
+    oneshot_parser.add_argument('--background_strategy', choices = _config.get('lisa_params', 'background_strategies').split(','),
+        default = 'regulatory',
+        help = """Background genes selection strategy. LISA samples background genes to compare to user\'s genes-of-interest from a diverse
+        regulatory background (regulatory - recommended), randomly from all genes (random), or uses a user-provided list (provided).
+        """)
+    background_genes_group = oneshot_parser.add_mutually_exclusive_group()
+    background_genes_group.add_argument('--background_list', type = argparse.FileType('r', encoding = 'utf-8'), required = False,
+        help = 'user-supplied list of background genes. Used when --background_strategy flag is set to "provided"')
+    background_genes_group.add_argument('-b','--num_background_genes', type = int, default = _config.get('lisa_params', 'background_genes'),
+        help = 'Number of sampled background genes to compare to user-supplied genes')
+    oneshot_parser.add_argument('-v','--verbose',type = int, default = 4)
+    oneshot_parser.set_defaults(func = lisa_oneshot, oneshot = True)
+
+    #__ LISA multi command __#################
+
+    multi_parser = subparsers.add_parser('multi', help = 'Process multiple gene lists. This reduces data-loading time if using the same parameters for all lists.\n')
+    build_common_args(multi_parser)
+    build_multiple_lists_args(multi_parser)
+    multi_parser.add_argument('-b','--num_background_genes', type = int, default = _config.get('lisa_params', 'background_genes'),
+        help = 'Number of sampled background genes to compare to user-supplied genes. These genes are selected from the other gene lists.')
+    multi_parser.add_argument('--random_background', action = 'store_const', const = 'random', default = 'regulatory', dest = 'background_strategy', help = 'Use random background selection rather than "regulatory" selection.')
+    multi_parser.set_defaults(func = lisa_multi, oneshot = False, background_list = None)
+
+    #__ LISA one-vs-rest command __#################
+
+    one_v_rest_parser = subparsers.add_parser('one-vs-rest', help = 'Compare gene lists in a one-vs-rest fashion. Useful downstream of cluster analysis.\n')
+    build_common_args(one_v_rest_parser)
+    build_multiple_lists_args(one_v_rest_parser)
+    one_v_rest_parser.set_defaults(func = lisa_one_v_rest, oneshot = False, background_strategy = 'provided')
+
+    download_data_parser = subparsers.add_parser('download', help = 'Download data from CistromeDB. Use if data received is incomplete or malformed.')
+    download_data_parser.add_argument('species', choices = ['hg38','mm10'], help = 'Download data associated with human (hg38) or mouse (mm10) genes')
+    download_data_parser.set_defaults(func = lisa_download)
+
+    args = parser.parse_args()
+
+    try:
+        args.func #first try accessing the .func attribute, which is missing if the user runs bare "lisa". In this case, don't throw an error, display help!
+    except AttributeError:
+        parser.print_help(sys.stderr)
+    else:
+        args.func(args)
+
\ No newline at end of file
diff --git a/docs/README.ipynb b/docs/README.ipynb
new file mode 100644
index 0000000..5dc94d5
--- /dev/null
+++ b/docs/README.ipynb
@@ -0,0 +1,125 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!source /Users/alynch/projects/.venvs/lisa_test/bin/activate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import lisa"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = pd.read_csv('/Users/alynch/projects/.venvs/lisa_test/lib/python3.7/site-packages/lisa/data/hg38/metadata/Motifs.tsv',\n",
+    "                sep = '\\t', encoding = 'latin')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       ATF2::JUN\n",
+       "1            IRF1\n",
+       "2           TFCP2\n",
+       "3           RUNX1\n",
+       "4           ZBTB6\n",
+       "          ...    \n",
+       "1056         ELF2\n",
+       "1057         Etv5\n",
+       "1058         Elf4\n",
+       "1059         Etv6\n",
+       "1060       Gm5454\n",
+       "Name: symbol, Length: 1061, dtype: object"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "c.symbol"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['id', 'edition', 'source', 'sourcefile', 'status', 'numseqs', 'pmid',\n",
+       "       'dbd', 'family', 'description', 'species', 'cellline', 'entrez',\n",
+       "       'symbol', 'synonym', 'refseq', 'cluster', 'comment2', 'comment2.1',\n",
+       "       'comment3', 'comment4', 'comment5', 'datasetid', 'zscore', 'seqfactors',\n",
+       "       'seqdbds', 'seqdatasetid', 'nmotifs', 'pssm'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "c.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "bioenv",
+   "language": "python",
+   "name": "bioenv"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/lisa/__init__.py b/lisa/__init__.py
new file mode 100644
index 0000000..d17bc76
--- /dev/null
+++ b/lisa/__init__.py
@@ -0,0 +1,3 @@
+from .lisa import LISA, _config
+from .utils import Log
+from ._version import __version__
\ No newline at end of file
diff --git a/lisa/_version.py b/lisa/_version.py
new file mode 100644
index 0000000..d97070a
--- /dev/null
+++ b/lisa/_version.py
@@ -0,0 +1 @@
+__version__ = '2.0.0'
\ No newline at end of file
diff --git a/lisa/assays.py b/lisa/assays.py
new file mode 100644
index 0000000..576ef67
--- /dev/null
+++ b/lisa/assays.py
@@ -0,0 +1,278 @@
+
+from lisa.utils import LoadingBar
+from multiprocessing import Pool
+import numpy as np
+from scipy import stats, sparse
+import os
+
+def get_delta_RP(profile, binding_data, rp_map):
+    '''
+    profile: chromatin profile, bin x 1 array of accessibility at each genomic region
+    rp_map: sparse gene x bin matrix mapping a genomic region to a gene based on RP proximity
+    binding_data: bin x dataset matrix, with binary hits to show TF binding occurs in a location
+
+    the line below defines the mapping of chromatin accessibility at TF binding sites to genes.
+
+    returns: gene x 1 x dataset matrix of delta RPs
+    '''
+    return np.array(rp_map.dot(binding_data.astype(np.bool).multiply(profile.reshape((-1,1)))).todense())[:,np.newaxis,:]
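+
+# Illustrative sketch of the shapes involved (toy sizes; not executed on real data):
+#   >>> profile = np.random.rand(5)                       # accessibility over 5 bins
+#   >>> binding = sparse.random(5, 3, density = 0.5)      # binding hits for 3 TF datasets
+#   >>> rp_map_toy = sparse.random(2, 5, density = 0.5)   # RP weights for 2 genes
+#   >>> get_delta_RP(profile, binding, rp_map_toy).shape
+#   (2, 1, 3)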
+
+#distributes arguments for get_delta_RP function, to be used by multiprocessing module
+def delta_RP_wrapper(x):
+    return get_delta_RP(*x)
+
+def mannu_test_function(x):
+    query, background = x
+    try:
+        return stats.mannwhitneyu(query, background, alternative = 'greater')
+    #if all values in query and background are equal (no knockouts), mannwhitneyu raises a ValueError
+    except ValueError:
+        #catch, return None for test statistic, 1.0 for p-val
+        return (None, 1.0)
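+
+# Illustrative sketch of the one-sided test used above (toy values):
+#   >>> mannu_test_function(([5.1, 4.8, 6.0], [1.2, 0.9, 1.1]))
+# Query delta-RPs that are systematically larger than the background yield a small
+# p-value; identical distributions yield a large one.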
+
+
+class LISA_RP_Assay:
+
+    invitro_metadata = ['factor','cell_line','cell_type','tissue']
+    insilico_metadata = ['factor','dbd','description','cell_line']
+
+    def __init__(self, technology, config, cores, log, metadata_headers, metadata_path):
+        self.config = config
+        self.technology = technology
+        self.cores = cores
+        self.log = log
+        self.loaded = False
+        self.subsetted = False
+        self.metadata_headers = metadata_headers
+        self.metadata_path = metadata_path
+
+    def load_rp_matrix(self, data_object, gene_mask = None, oneshot = False):
+
+        self.log.append('Loading {} RP matrix ...'.format(self.technology))
+
+        dataset_ids = data_object[self.config.get('accessibility_assay', 'reg_potential_dataset_ids').format(technology = self.technology)][...].astype(str)
+
+        if oneshot and not gene_mask is None:
+            self.subsetted = True
+            rp_matrix = data_object[self.config.get('accessibility_assay', 'reg_potential_matrix').format(technology = self.technology)][gene_mask, :]
+        else:
+            rp_matrix = data_object[self.config.get('accessibility_assay', 'reg_potential_matrix').format(technology = self.technology)][...]
+
+        self.loaded = True
+        return rp_matrix, dataset_ids
+
+
+    def get_delta_RP_p_value(self, gene_TF_scores, label_vector):
+        '''
+        gene_TF_scores: gene x TF matrix, model output of delta-RP scores. More perturbation of the genes of interest corresponds with a higher delta regulatory score
+        '''
+        #separate matrix into query and background sets
+        query_delta = gene_TF_scores[label_vector.astype(np.bool)]
+        background_delta = gene_TF_scores[~label_vector.astype(np.bool)]
+
+        #for each TF, calculate p-value of difference in delta-RP distributions between query and background genes
+        test_parameters = list(zip(query_delta.T, background_delta.T))
+
+        if self.cores == 1:
+            p_vals = [
+                mannu_test_function((q,b)) for q,b in test_parameters
+            ]
+        else:
+            with Pool(self.cores) as p:
+                p_vals = p.map(mannu_test_function, test_parameters)
+
+        _, p_values = list(zip(*p_vals))
+
+        return p_values
+
+    def get_info(self):
+        raise NotImplementedError()
+
+    def predict(self, gene_mask, label_vector, data_object = None, debug = False):
+        raise NotImplementedError()
+
+    def load_metadata(self):
+        #reformat metadata tsv into convenient id-indexed dictionary
+        with open(self.metadata_path, 'r', encoding = 'latin') as metadata_file:
+            metadata = [[field.strip() for field in line.split('\t')] for line in metadata_file.readlines()]
+
+        meta_headers, metadata = metadata[0], metadata[1:]
+
+        return {metaline[0] : dict(zip(meta_headers[1:], metaline[1:])) for metaline in metadata}
+
+    def load(self, data_object, gene_mask = None, oneshot = False):
+        self.rp_matrix, self.dataset_ids = self.load_rp_matrix(data_object, gene_mask = gene_mask, oneshot=oneshot)
+        self.metadata = self.load_metadata()
+
+    def get_metadata(self, sample_ids):
+        return dict(
+            sample_id = sample_ids,
+            **{
+                header : [self.metadata[_id][header] for _id in sample_ids]
+                for header in self.metadata_headers
+            }
+        )
+
+class PeakRP_Assay(LISA_RP_Assay):
+
+    def predict(self, gene_mask, label_vector, data_object = None, debug = False):
+
+        try:
+            self.rp_matrix
+        except AttributeError:
+            self.load(data_object, gene_mask=gene_mask)
+
+        self.log.append('Calculating {} peak-RP p-values ...'.format(self.technology))
+
+        #calculate p-values by directly applying mannu-test on RP matrix. Subset the RP matrix for genes-of-interest if required
+        p_vals = self.get_delta_RP_p_value(self.rp_matrix[gene_mask, :] if not self.subsetted else self.rp_matrix, label_vector)
+
+        return p_vals
+
+    def get_info(self):
+        return dict()
+
+    def get_metadata(self):
+        return super().get_metadata(self.dataset_ids)
+
+#Generator that repeatedly yields the same factor_binding and rp_map matrices with a new accessibility profile
+#to the multiprocessing pool creator. This reduces data redundancy.
+class KnockoutGenerator:
+
+    def __init__(self, accessibility_profiles, factor_binding, rp_map):
+        self.accessibility_profiles = accessibility_profiles
+        self.rp_map = rp_map
+        self.factor_binding = factor_binding
+
+    def __iter__(self):
+        for profile in self.accessibility_profiles.T:
+            yield profile, self.factor_binding, self.rp_map
+
+class Accessibility_Assay(LISA_RP_Assay):
+
+    def __init__(self, technology, config, cores, log, metadata_path, *, rp_map, factor_binding, selection_model, chromatin_model,
+        metadata_headers = LISA_RP_Assay.invitro_metadata):
+        super().__init__(technology, config, cores, log, metadata_headers, metadata_path)
+        self.rp_map = rp_map
+        self.factor_binding = factor_binding
+        self.selection_model = selection_model
+        self.chromatin_model = chromatin_model
+
+    def get_info(self):
+        return dict(
+            selection_model = self.selection_model.get_info(),
+            chromatin_model = self.chromatin_model.get_info(),
+            selected_datasets = self.get_metadata(list(self.selected_dataset_ids)),
+        )
+
+    def load_accessibility_profiles(self, data_object, selected_dataset_ids):
+
+        loadingbar = LoadingBar('Reading {} data'.format(self.technology), len(selected_dataset_ids), 20)
+
+        accessibility_profiles = []
+        for selected_dataset in selected_dataset_ids:
+            self.log.append(loadingbar, update_line = True)
+            accessibility_profiles.append(
+                data_object[self.config.get('accessibility_assay','binned_reads')\
+                    .format(technology = self.technology, dataset_id = selected_dataset)][...][:,np.newaxis]
+            )
+
+        accessibility_profiles = np.concatenate(accessibility_profiles, axis = 1)
+
+        return accessibility_profiles
+
+
+    def calculate_ISDs(self, accessibility_profiles, factor_binding, rp_map):
+        """
+        log: log object for printing
+        accessibility_profiles: (bins, num_datasets): chromatin accessibility datasets on which to analyze the effects of ISD
+        factor_binding: (num_bins, TFs), a sparse binary map showing bins that contain a chip-seq or motif peak to knock out
+        rp_map: (genes, num_bins), precalculated matrix mapping the reads in a bin to a regulatory score for each gene (this is a huge matrix)

+        returns:
+            delta_regulatory_score (genes x TFs)
+        """
+        '''
+        # Code for non-multiprocessing implementation of this ISD algorithm. Much slower.
+        knockouts = []
+        loadingbar = LoadingBar('Performing in-silico knockouts', accessibility_profiles.shape[1], 20, cold_start = True)
+        log.append(loadingbar, update_line = True)
+        for profile in accessibility_profiles.T:
+            knockouts.append(get_delta_RP(profile, factor_binding, rp_map))
+            log.append(loadingbar, update_line = True)'''
+
+        self.log.append('Performing in-silico knockouts ...')
+
+        with Pool(self.cores) as p:
+            knockouts = p.map(delta_RP_wrapper, iter(KnockoutGenerator(accessibility_profiles, factor_binding, rp_map)))
+
+        #concatenate datasets to form gene x TF x datasets shaped matrix
+        self.log.append('Calculating Δ regulatory score ...')
+        datacube = np.concatenate(knockouts, axis = 1)
+
+        num_genes, _, num_TFs = datacube.shape
+
+        delta_regulation_score = self.chromatin_model.get_deltaRP_activation(datacube)
+
+        assert(delta_regulation_score.shape == (num_genes, num_TFs))
+
+        return delta_regulation_score, datacube
+
+
+    def predict(self, gene_mask, label_vector, debug = False, *, data_object):
+
+        try:
+            self.rp_matrix
+        except AttributeError:
+            self.load(data_object, gene_mask= gene_mask)
+
+        #find bins of gene-subsetted rp-map with RP > 0
+        bin_mask = np.squeeze(np.array(self.rp_map[gene_mask, : ].tocsc().sum(axis = 0) > 0))
+        #subset rp_map and factor hits on bins with RP > 0
+        subset_factor_binding = self.factor_binding[bin_mask, :]
+
+        subset_rp_map = self.rp_map[gene_mask, :][:, bin_mask]
+
+        subset_rp_matrix = self.rp_matrix[gene_mask, :] if not self.subsetted else self.rp_matrix
+
+        with self.log.section('Modeling {} perturbations:'.format(self.technology)):
+            #accessibility model building and perturbation
+            self.log.append('Selecting discriminative datasets and training chromatin model ...')
+
+            #select the most discriminative datasets
+            dataset_mask = self.selection_model.fit(subset_rp_matrix, label_vector)
+            #subset the best datasets
+            subset_rp_matrix, self.selected_dataset_ids = subset_rp_matrix[:, dataset_mask], self.dataset_ids[dataset_mask]
+            #fit the chromatin model to these datasets
+            self.chromatin_model.fit(subset_rp_matrix, label_vector, self.cores)
+
+            with self.log.section('Calculating in-silico deletions:'):
+
+                accessibility_profiles = self.load_accessibility_profiles(data_object, self.selected_dataset_ids)
+
+                accessibility_profiles = accessibility_profiles[bin_mask, :]
+
+                delta_reg_scores, datacube = self.calculate_ISDs(accessibility_profiles, subset_factor_binding, subset_rp_map)
+
+            self.log.append('Calculating p-values ...')
+
+            p_vals = self.get_delta_RP_p_value(delta_reg_scores, label_vector)
+
+        self.log.append('Done!')
+
+        if debug:
+            return p_vals, dict(
+                gene_mask = gene_mask,
+                label_vector = label_vector,
+                subset_rp_matrix = subset_rp_matrix,
+                subset_rp_map = subset_rp_map,
+                subset_factor_binding = subset_factor_binding,
+                datacube = datacube,
+                delta_reg_scores = delta_reg_scores,
+                dataset_mask = dataset_mask,
+                bin_mask = bin_mask,
+            )
+        else:
+            return p_vals
\ No newline at end of file
diff --git a/lisa/config.ini b/lisa/config.ini
new file mode 100644
index 0000000..f69c709
--- /dev/null
+++ b/lisa/config.ini
@@ -0,0 +1,47 @@
+
+
+#Config file specifying the paths to resources used in LISA
+
+[genes]
+master_gene_list: {package_path}/data/{species}/genes.tsv
+gene_locs : {package_path}/data/{species}/gene_locs.txt
+
+[accessibility_assay]
+reg_potential_gene_locs: {technology}/regulatory_potential/gene_locs
+reg_potential_dataset_ids: {technology}/regulatory_potential/dataset_ids
+reg_potential_matrix : {technology}/regulatory_potential/rp_matrix
+binned_reads : {technology}/binned_reads/{dataset_id}
+technologies : Dnase,H3K27ac,ChIP-seq_RP,Motif_RP
+
+[factor_binding]
+dataset_ids: {technology}_dataset_ids
+technologies: ChIP-seq,Motif
+matrix: {package_path}/data/{species}/{technology}_binding.npz
+
+[metadata]
+DNase: {package_path}/data/{species}/metadata/lisa_meta.tsv
+H3K27ac: {package_path}/data/{species}/metadata/lisa_meta.tsv
+ChIP-seq: {package_path}/data/{species}/metadata/lisa_meta.tsv
+Motifs: {package_path}/data/{species}/metadata/motifs_meta.tsv
+
+[RP_map]
+matrix: {package_path}/data/{species}/RP_map.npz
+
+[paths]
+h5_path: {package_path}/data/{species}/lisa_data_{species}_reads_normalized.h5
+metadata: {package_path}/data/{species}/lisa_metadata.tsv
+dataset_version: {package_path}/data/{species}_version.txt
+
+[lisa_params]
+num_anova_selected: 200
+num_datasets_selected: 10
+background_genes: 3000
+max_user_genelist_len: 500
+background_strategies: regulatory,random,provided
+num_bins: 3209513
+bin_length: 1000
+isd_methods: chipseq,motifs
+
+[downloads]
+hg38_2.0: http://cistrome.org/~alynch/data/lisa_data/hg38_2.0.tar.gz
+mm10_2.0: http://cistrome.org/~alynch/data/lisa_data/mm10_2.0.tar.gz
\ No newline at end of file
diff --git a/lisa/gene_selection.py b/lisa/gene_selection.py
new file mode 100644
index 0000000..47280c7
--- /dev/null
+++ b/lisa/gene_selection.py
@@ -0,0 +1,309 @@
+
+import numpy as np
+import os
+import random
+from collections import Counter, defaultdict, OrderedDict
+
+class Gene:
+    '''
+    A gene has a unique genomic region, an accepted name, and aliases that also correspond to that gene or genomic region
+    '''
+    def __init__(self, chrom, tss_start, tss_end, names, tad_domain = None):
+        self.chrom = chrom
+        self.start = int(tss_start)
+        self.end = int(tss_end)
+        self.aliases = []
+        self.tad_domain = tad_domain
+        if isinstance(names, str):
+            self.add_alias(names)
+        else:
+            self.add_aliases(names)
+        self.location = ':'.join([str(property_) for property_ in (self.chrom, self.start, self.end)])
+
+    def add_alias(self, alias, is_name = True):
+
+        if not alias in self.aliases:
+            self.aliases.append(alias)
+
+    def get_name(self):
+        return self.aliases[0]
+
+    def add_aliases(self, aliases):
+        assert( isinstance(aliases, list) )
+        for alias in aliases:
+            self.add_alias(alias)
+
+    def get_location(self):
+        return self.location
+
+    def __eq__(self, other):
+        if isinstance(other, str):
+            return other in self.aliases
+        elif isinstance(other, Gene):
+            return self.get_location() == other.get_location()
+        else:
+            return False
+
+    def __repr__(self):
+        return '\t'.join([str(x) for x in [self.get_location(), self.aliases[0], self.tad_domain, '|'.join(self.aliases)]])
+
+    def __str__(self):
+        return self.get_name()
+
+
+    def get_RP_signature(self, bins, bin_index, delta = 10000, max_influence_distance = 100000):
+
+        tss = self.start
+
+        #find bin regions defining interesting RP region
+        min_bin, split_bin, max_bin = np.digitize([tss - max_influence_distance, tss, tss + max_influence_distance], bins)
+        #subset interesting region of chrom
+        bin_indices = bin_index[min_bin: max_bin - 1]
+        #split the bin holding the TSS into two bins
+        bins = np.concatenate([bins[min_bin: split_bin], [tss], bins[split_bin: max_bin]])
+        split_bin -= min_bin
+        tss_bins = (split_bin - 1, split_bin)
+
+        #get bin intervals to the left and right of TSS, then concatenate
+        left_bins = np.abs(np.array(list(zip(bins[1:tss_bins[1] + 1], bins[:tss_bins[0] + 1]))) - tss)
+        right_bins = np.abs(np.array(list(zip(bins[tss_bins[1]:-1], bins[tss_bins[1] + 1:]))) - tss)
+        intervals = np.concatenate([left_bins, right_bins], axis = 0)
+
+        #get integral of RP at boundary locations
+        RP = intervals * (-np.log(1/3) / delta)
+        RP = 2 * ( RP - np.log(np.exp(RP) + 1))
+        #compute RP over area
+        RP = np.subtract(RP[:,1], RP[:,0])
+        #sum bins split by TSS
+        summed_tss_rp = RP[list(tss_bins)].sum()
+        RP[tss_bins[0]] = summed_tss_rp
+        #remove split bin
+        RP = np.delete(RP, tss_bins[1])
+        #normalize so bin with TSS has RP of 1
+        RP = RP / RP.max()
+
+        return RP, bin_indices
+
+
+class GeneSet:
+    '''
+    Enforces gene organization rules:
+        A genomic location may correspond to many names
+        A name may correspond to many locations
+        Primary organization should be by genomic location
+    '''
+
+    def __init__(self):
+        self.genes_by_name = defaultdict(list)
+        self.genes_by_chr = OrderedDict()
+
+    def add_genes(self, new_genes):
+        for new_gene in new_genes:
+            self.add_gene(new_gene)
+
+        return self
+
+    def add_gene(self, new_gene):
+
+        #check if gene location is already inhabited
+        if new_gene.get_location() in self.genes_by_chr:
+            #get existing gene object
+            existing_gene = self.genes_by_chr[new_gene.get_location()]
+            #adds the new names to the existing object for this genomic location / gene
+            existing_gene.add_aliases(new_gene.aliases)
+            #adds pointers from these names to the existing gene
+            for alias in new_gene.aliases:
+                #if this alias is not registered
+                if alias.upper() not in self.genes_by_name:
+                    #add the location under the alias
+                    self.genes_by_name[alias.upper()].append(existing_gene)
+
+        else:
+            #add this gene under its genomic location
+            self.genes_by_chr[new_gene.get_location()] = new_gene
+
+            #for its names, add this location
+            for alias in new_gene.aliases:
+                #uppercase the gene name so that capitalization is not a factor
+                self.genes_by_name[alias.upper()].append(new_gene)
+
+        return self
+
+    def get_symbols(self):
+        return [gene.get_name() for gene in self]
+
+    def get_locations(self):
+        return [gene.get_location() for gene in self]
+
+    def get_distinct_genes_by_symbol(self, excluding = set()):
+
+        names = self.get_symbols()
+
+        distinct_names = set(names).difference(excluding)
+
+        distinct_genes = GeneSet()
+        for distinct_name in distinct_names:
+            distinct_genes.add_gene(self.get_gene_by_name(distinct_name))
+
+        return distinct_genes
+
+
+    def get_gene_by_name(self, name):
+
+        name = name.upper()
+        if name not in self.genes_by_name:
+            raise KeyError(name)
+        else:
+            return self.genes_by_name[name][0]
+
+
+    def __str__(self):
+        return '\t'.join(['location','gene_name','tad_domain','aliases']) + '\n' + '\n'.join([repr(gene) for gene in self.genes_by_chr.values()])
+
+    def from_str(self, save_str):
+
+        lines = save_str.split('\n')
+        #skip header line
+        for line in lines[1:]:
+            location, name, tad, aliases = [x.strip() for x in line.split('\t')]
+            new_gene = Gene(*location.split(':'), aliases.split('|'), tad_domain = tad)
+            self.add_gene(new_gene)
+
+        return self
+
+
+    def get_genes_by_chrom(self, chromosome):
+        return [
+            gene for location_key, gene in self.genes_by_chr.items() if location_key.split(':')[0] == chromosome
+        ]
+
+
+    def __len__(self):
+        return len(self.genes_by_chr)
+
+    def __iter__(self):
+        return iter(list(self.genes_by_chr.values()))
+
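+    # Illustrative usage sketch (toy coordinates):
+    #   >>> gs = GeneSet().add_genes([Gene('chr1', 1000, 1200, ['TP53', 'P53'])])
+    #   >>> gs.get_gene_by_name('p53').get_location()
+    #   'chr1:1000:1200'
+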
+    def match_user_provided_genes(self, user_genelist):
+        rejects = []
+        selected_genes = GeneSet()
+        for gene_candidate in user_genelist:
+            try:
+                selected_genes.add_gene( self.get_gene_by_name(gene_candidate) )
+            except KeyError:
+                rejects.append(gene_candidate)
+
+        return selected_genes.get_distinct_genes_by_symbol()
+
+
+    def random_sample(self, sample_num, seed = None):
+        if not seed is None:
+            np.random.seed(seed)
+
+        assert(len(self) >= sample_num), 'Background gene list provided must contain at least {} genes'.format(str(sample_num))
+
+        #if same number of genes, skip sample
+        if len(self) == sample_num:
+            return self
+
+        else:
+            return GeneSet().add_genes(np.random.choice(sorted(list(self), key = lambda x : x.location), sample_num, replace = False))
+
+    #enforce sampling according to the TAD distribution of the genome
+    def sample_by_TAD(self, sample_num, seed = None):
+
+        if not seed is None:
+            np.random.seed(seed)
+
+        #intersect background gene list with TAD_list to eliminate reserved genes, sorting to maintain order for seed repeatability
+        TAD_data = [(gene, gene.tad_domain) for gene in sorted(self, key = lambda x : x.location)]
+
+        #collect list of genes in each bin
+        genes_in_TAD = defaultdict(list)
+        for gene, tad_group in TAD_data:
+            genes_in_TAD[tad_group].append(gene)
+
+        #calculates number of genes in each TAD group
+        num_genes_in_TAD = {tad_group : len(genes) for tad_group, genes in genes_in_TAD.items()}
+
+        #calculates the number of genes expected to be sampled from each TAD, with a bit of an enrichment (1.1x). Ensure all TADs have >= 1 expected genes
+        expected_samples = {
+            tad_group : max(1, int(num_genes / len(TAD_data) * sample_num * 1.1))
+            for tad_group, num_genes in num_genes_in_TAD.items()
+        }
+
+        #samples the expected number of genes from the TAD (or the number of genes if expected is > actual)
+        selected_genes = GeneSet()
+        for tad_group, num_expected in expected_samples.items():
+            sampled_genes = np.random.choice(sorted(genes_in_TAD[tad_group], key = lambda x : x.location), min(num_genes_in_TAD[tad_group] - 1, num_expected), replace = False)
+            selected_genes.add_genes(sampled_genes)
+
+        return selected_genes
+
+
+#creates a label dictionary, keyed by gene location, with binary labels: {0 = background, 1 = query}
+def create_label_dictionary(query_list, background_list):
+    return { #this dict is used to create label vector for LISA algo
+        gene.get_location() : int(i < len(query_list))
+        for i, gene in enumerate(list(query_list) + list(background_list))
+    }, dict( #return this dict for results (purely informational)
+        query_symbols = query_list.get_symbols(),
+        background_symbols = background_list.get_symbols(),
+        query_locations = query_list.get_locations(),
+        background_locations = background_list.get_locations(),
+    )
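+
+# Illustrative sketch of the return values, assuming toy GeneSet objects:
+#   labels, info = create_label_dictionary(query_geneset, background_geneset)
+#   labels -> {'chr1:1000:1200': 1, 'chr2:500:700': 0, ...}   # 1 = query, 0 = background
+#   info   -> {'query_symbols': [...], 'background_symbols': [...],
+#              'query_locations': [...], 'background_locations': [...]}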
+
+#converts user-entered data to two gene lists: a query list, and a background list
+def select_genes(query_list, gene_set, num_background_genes = 3000,
+    background_strategy = 'regulatory', max_query_genes = 200, background_list = [], seed = None):
+    '''
+    query_list: text-processed list of symbols or refseqIDs (or both) that user supplies
+    gene_set: GeneSet of all genes in the LISA database
+    num_background_genes: number of background genes to select
+    background_strategy: regulatory, provided, or random background selection strategy
+    max_query_genes: config setting, maximum number of unique query genes
+    background_list: text-processed list of symbols or refseqIDs (or both) that user supplies as candidate background genes
+    seed: random seed for reproducible background sampling
+    '''
+    query_genes = gene_set.match_user_provided_genes(query_list)
+
+    assert(20 <= len(query_genes) <= max_query_genes), 'User must provide a list of 20 to {} unique genes. Provided {}'\
+        .format(str(max_query_genes), str(len(query_genes)))
+
+    if background_strategy == 'provided':
+
+        background_matches = gene_set.match_user_provided_genes(background_list)
+
+        background_genes = background_matches.get_distinct_genes_by_symbol(excluding = query_genes.get_symbols())
+
+        assert( len(background_genes) > len(query_genes) )
+
+    else:
+
+        assert( num_background_genes >= len(query_genes) and len(query_genes) <= 19000//2 ), "More query genes selected than background genes"
+
+        background_candidates = gene_set.get_distinct_genes_by_symbol(excluding = query_genes.get_symbols())
+
+        assert(len(background_candidates) >= num_background_genes), 'Number of background candidates must be greater than or equal to the number of genes to select as background.'
+
+        #if no down-sampling is needed:
+        if len(background_candidates) == num_background_genes:
+            background_genes = background_candidates
+
+        elif background_strategy == 'regulatory':
+
+            background_genes = background_candidates.sample_by_TAD(num_background_genes, seed = seed)
+
+            if len(background_genes) > num_background_genes:
+                background_genes = background_genes.random_sample(num_background_genes, seed = seed)
+
+        elif background_strategy == 'random':
+            background_genes = background_candidates.random_sample(num_background_genes, seed = seed)
+
+        else:
+            raise AssertionError('Background selection strategy {} not supported'.format(background_strategy))
+
+    return create_label_dictionary(query_genes, background_genes)
\ No newline at end of file
diff --git a/lisa/lisa.py b/lisa/lisa.py
new file mode 100644
index 0000000..dc55419
--- /dev/null
+++ b/lisa/lisa.py
@@ -0,0 +1,379 @@
+
+#lisa modules
+import lisa.gene_selection as gene_selection
+import lisa.assays as assays
+from lisa.utils import LoadingBar, Log, LISA_Results
+from lisa.models import LR_BinarySearch_SampleSelectionModel
+from lisa.models import LR_ChromatinModel
+from lisa._version import __version__
+
+#standard library
+import configparser
+from collections.abc import Iterable
+import sys
+import json
+import multiprocessing
+from urllib import request
+import tarfile
+import os
+
+#required
+import numpy as np
+import h5py as h5
+from scipy import sparse
+
+PACKAGE_PATH = os.path.dirname(__file__)
+CONFIG_PATH = os.path.join(PACKAGE_PATH, 'config.ini')
+REQUIRED_DATASET_VERSION = '.'.join(__version__.split('.')[:2])
+
+_config = configparser.ConfigParser()
+_config.read(CONFIG_PATH)
+
+class DownloadRequiredError(BaseException):
+    pass
+
+class LISA:
+    '''
+    The LISA object is the user's interface with the LISA algorithm. It holds the parameters specified by the user and
+    handles data loading from hdf5.
+    '''
+    #class dictionary, converts cmd line alias to standard symbols for knockout methods
+    factor_binding_technologies = dict(
+        chipseq = 'ChIP-seq',
+        motifs = 'Motifs'
+    )
+
+    def __init__(self, species,
+        cores = 1,
+        isd_method = 'chipseq',
+        num_datasets_selected_anova = 200,
+        num_datasets_selected = 10,
+        verbose = True,
+        oneshot = False,
+        log = None,
+    ):
+        #all parameter checking is done on LISA instantiation to ensure consistency between python module and cmd line usage
+        self.isd_options = _config.get('lisa_params', 'isd_methods').split(',')
+        self.background_options = _config.get('lisa_params', 'background_strategies').split(',')
+        self.max_query_genes = int(_config.get('lisa_params', 'max_user_genelist_len'))
+
+        assert( isinstance(num_datasets_selected, int) )
+        assert( isinstance(num_datasets_selected_anova, int) )
+
+        self.num_datasets_selected_anova = num_datasets_selected_anova
+        self.num_datasets_selected = num_datasets_selected
+
+        self._set_cores(cores)
+
+        assert( num_datasets_selected_anova > num_datasets_selected ), 'Anova must select more datasets than the regression model'
+        assert( num_datasets_selected_anova > 0 and num_datasets_selected > 0 ), 'Number of datasets selected must be positive'
+        assert(num_datasets_selected_anova < 500)
+        assert(num_datasets_selected < 25)
+
+        assert( isd_method in self.isd_options ), 'ISD method must be one of: {}'.format(', '.join(self.isd_options))
+        assert( species in ['mm10','hg38'] ), "Species must be either hg38 or mm10"
+
+        self.isd_method = self.factor_binding_technologies[isd_method]
+        self.species = species
+
+        self.data_source = _config.get('paths', 'h5_path').format(package_path = PACKAGE_PATH, species = self.species)
+
+        self.data_path = os.path.join(PACKAGE_PATH, 'data')
+
+        #use provided log object or create a new one. Applications which call LISA may want to pass in their own log object for control of indentation
+        if log is None:
+            self.log = Log(sys.stderr, verbose = verbose)
+        else:
+            assert( isinstance(log, Log) )
+            self.log = log
+
+        #data has not yet been loaded
+        self._is_loaded = False
+
+        self.assays = []
+
+        assert( isinstance(oneshot, bool ))
+        self.oneshot = oneshot
+
+        self.used_oneshot = False
+
+
+    def _set_cores(self, cores):
+        assert( isinstance(cores, int) and cores >= -1)
+        #leave one core out for the rest of us
+        max_cores = multiprocessing.cpu_count() - 1
+        if cores <= -1:
+            cores = max_cores
+        #use the minimum number of cores between the user's specification, the number of datasets to be processed in parallel, and the number of cores on the machine.
+        #this prevents LISA from using more resources than required.
+ self.cores = min(cores, max_cores, self.num_datasets_selected) + + def _preprocess_gene_list(self, genes): + #make sure gene lists are composed of stripped strings + return [str(gene).strip() for gene in genes if len(gene) > 0] + + def _load_gene_info(self): + + all_genes = gene_selection.GeneSet() + + with open(_config.get('genes','master_gene_list').format(package_path = PACKAGE_PATH, species = self.species), 'r') as genes: + all_genes.from_str(genes.read()) + + with open(_config.get('genes','gene_locs').format(package_path = PACKAGE_PATH, species = self.species), 'r') as f: + rp_map_locs = np.array([line.strip() for line in f.readlines()]) + + return all_genes, rp_map_locs + + + def _load_factor_binding_data(self, data_object): + + self.factor_dataset_ids = list(data_object[_config.get('accessibility_assay','reg_potential_dataset_ids')\ + .format(technology = self.isd_method)][...].astype(str)) + + return sparse.load_npz(_config.get('factor_binding','matrix').format(package_path = PACKAGE_PATH, species = self.species, technology = self.isd_method)) + + def _load_rp_map(self, data_object): + + return sparse.load_npz(_config.get('RP_map','matrix').format(package_path = PACKAGE_PATH, species = self.species)).tocsr() + + + def add_assay(self, data_object, gene_mask, assay): + assay.load(data_object, gene_mask=gene_mask, oneshot=self.oneshot) + self.assays.append(assay) + + + def _load_data(self, gene_mask = None): + + if self._is_loaded: + raise AssertionError('Data is already loaded') + + #if not already set, load this data + try: + self.rp_map_locs + except AttributeError: + self.all_genes, self.rp_map_locs = self._load_gene_info() + + with self.log.section('Loading data into memory (only on first prediction):') as log: + + with h5.File(self.data_source, 'r') as data: + + with log.section('Loading binding data ...') as log: + self.factor_binding = self._load_factor_binding_data(data) + + log.append('Loading regulatory potential map ...') + self.rp_map = self._load_rp_map(data) + + #Add assays to LISA's steps. Each assay follows the same instantiation and prediction calling, making them modular and substitutable. 
+                #Adding an assay loads the required data for that assay
+
+                #The first assay to be loaded must be the ChIP-seq or Motif direct knockout, because this assay supplies the
+                #metadata for the final output table since it holds information on all factor binding samples
+                self.add_assay(data, gene_mask,
+                    assays.PeakRP_Assay(self.isd_method, _config, self.cores, self.log,
+                        assays.LISA_RP_Assay.invitro_metadata if self.isd_method == 'ChIP-seq' else assays.LISA_RP_Assay.insilico_metadata,
+                        _config.get('metadata',self.isd_method).format(package_path = PACKAGE_PATH, species = self.species, technology =self.isd_method),
+                    )
+                )
+
+                self.add_assay(data, gene_mask,
+                    assays.Accessibility_Assay('DNase', _config, self.cores, self.log,
+                        _config.get('metadata','DNase').format(package_path = PACKAGE_PATH, species = self.species, technology = 'DNase'),
+                        rp_map = self.rp_map, factor_binding = self.factor_binding,
+                        selection_model = LR_BinarySearch_SampleSelectionModel(self.num_datasets_selected_anova, self.num_datasets_selected),
+                        chromatin_model = LR_ChromatinModel({'C' : list(10.0**np.arange(-2,4.1,0.5))}, penalty = 'l2')
+                    )
+                )
+
+                self.add_assay(data, gene_mask,
+                    assays.Accessibility_Assay('H3K27ac', _config, self.cores, self.log,
+                        _config.get('metadata', 'H3K27ac').format(package_path = PACKAGE_PATH, species = self.species, technology = 'H3K27ac'),
+                        rp_map = self.rp_map, factor_binding = self.factor_binding,
+                        selection_model = LR_BinarySearch_SampleSelectionModel(self.num_datasets_selected_anova, self.num_datasets_selected),
+                        chromatin_model = LR_ChromatinModel({'C' : list(10.0**np.arange(-2,4.1,0.5))}, penalty = 'l2')
+                    )
+                )
+
+        self.log.append('Done!')
+
+        self._is_loaded = True
+
+
+    def _check_for_data(self):
+        if not (os.path.isdir(self.data_path) and os.path.isdir(os.path.join(self.data_path, self.species))):
+            self.log.append('Data not found, must download from CistromeDB ...')
+            raise DownloadRequiredError()
+
+        else:
+            with open(_config.get('paths','dataset_version').format(package_path = PACKAGE_PATH, species = self.species), 'r') as v:
+                dataset_version = v.read().strip()
+
+            if not REQUIRED_DATASET_VERSION == dataset_version:
+                self.log.append('Dataset version mismatch, must download new dataset from CistromeDB ...')
+                raise DownloadRequiredError()
+
+
+    def _download_data(self):
+
+        with self.log.section('Grabbing {} data (~15 minutes):'.format(self.species)):
+
+            self.log.append('Downloading from database ...')
+            #make data directory if does not exist
+            if not os.path.isdir(self.data_path):
+                os.mkdir(self.data_path)
+
+            download_dataset = _config.get('downloads','{species}_{version}'.format(species = self.species, version = REQUIRED_DATASET_VERSION))
+
+            filename, _ = request.urlretrieve(
+                download_dataset,
+                os.path.join(self.data_path, self.species + '_data.tar.gz')
+            )
+
+            self.log.append('Extracting data ...')
+
+            with tarfile.open(filename) as tar:
+                tar.extractall(path = self.data_path)
+
+            os.remove(filename)
+
+            with open(_config.get('paths','dataset_version').format(package_path = PACKAGE_PATH, species = self.species), 'w') as v:
+                v.write(REQUIRED_DATASET_VERSION)
+
+            self.log.append('Done!\n')
+
+    @staticmethod
+    def _combine_tests(p_vals, weights=None):
+        #https://arxiv.org/abs/1808.09011
+        #combine p-values from all assays
+        p_vals = np.array(p_vals).astype(np.float64)
+
+        assert(len(p_vals.shape) == 2 ), 'P-values must be provided as matrix of (samples, multiple p-values)'
+        assert(p_vals.shape[1] > 1), 'Must have multiple p-values to combine'
+
+        #clip p-values at minimum float64 value to prevent underflow
+        p_vals = np.clip(p_vals, np.nextafter(0, 1, dtype = np.float64), 1.0)
+        if weights is None:
+            weights = np.ones((1, p_vals.shape[1])) / p_vals.shape[1]
+        else:
+            assert(p_vals.shape[1] == weights.shape[1])
+
+        test_statistic = np.sum(weights * np.tan((0.5-p_vals) * np.pi), axis = 1)
+        combined_p_value = 0.5 - np.arctan(test_statistic)/np.pi
+
+        return combined_p_value
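+
+    # Illustrative check of the Cauchy combination above (toy values):
+    #   >>> LISA._combine_tests(np.array([[0.01, 0.5, 0.9]]))
+    # With equal weights, T = mean(tan((0.5 - p) * pi)) ≈ (31.82 + 0.0 - 3.08) / 3 ≈ 9.58,
+    # so p_combined = 0.5 - arctan(T)/pi ≈ 0.033: the combined value is dominated by the
+    # strongest individual p-value, a key property of this combination test.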
+
+    #process user-supplied text
+    def _make_gene_mask(self, query_list, background_list, num_background_genes, background_strategy, seed = None):
+        '''
+        query_list: list of any text that may be a query gene
+        background_list : list of any text that may be a background gene
+        These lists are lightly processed, then matched with possible genes in the lisa database
+
+        returns:
+            gene_mask: mask for which genes in lisa were selected for query and background
+            label_vector: query or background set membership for each of those genes
+            gene_info_dict: gene info for results printout
+        '''
+        query_list = self._preprocess_gene_list(query_list)
+        background_list = self._preprocess_gene_list(background_list)
+
+        #match user-supplied text with genes, and select background genes
+        label_dict, gene_info_dict = gene_selection.select_genes(query_list, self.all_genes,
+            num_background_genes = num_background_genes, background_strategy = background_strategy,
+            max_query_genes = self.max_query_genes, background_list = background_list, seed = seed)
+
+        #Show user number of query and background genes selected
+        self.log.append('Selected {} query genes and {} background genes.'\
+            .format(str(len(gene_info_dict['query_symbols'])), str(len(gene_info_dict['background_symbols']))))
+
+        #subset the rp map for those genes and make label vector
+        gene_mask = np.isin(self.rp_map_locs, list(label_dict.keys()))
+        #make label vector from label_dict based of gene_loc ordering in rp_map data
+        label_vector = np.array([label_dict[gene_loc] for gene_loc in self.rp_map_locs[gene_mask]])
+
+        return gene_mask, label_vector, gene_info_dict
+
+    def predict(self, query_list,
+        background_list = [],
+        background_strategy = 'regulatory',
+        num_background_genes = 3000,
+        seed = None,
+    ):
+
+        if self.oneshot and self.used_oneshot:
+            raise AssertionError('When instantiated in one-shot, model cannot be used for multiple predictions')
+
+        if background_list is None:
+            background_list = []
+        assert( isinstance(num_background_genes, int) )
+        assert( background_strategy in self.background_options ), 'Background strategy must be one of: {}'.format(', '.join(self.background_options))
+        assert( isinstance(query_list, Iterable) )
+        assert( isinstance(background_list, Iterable))
+
+        try:
+            self._check_for_data()
+        except DownloadRequiredError:
+            self._download_data()
+            self._check_for_data()
+
+        try:
+            self.log.append('Using {} cores ...'.format(str(self.cores)))
+
+            if not self._is_loaded:
+                self.all_genes, self.rp_map_locs = self._load_gene_info()
+
+            with self.log.section('Matching genes and selecting background ...'):
+
+                if len(background_list) > 0 and background_strategy == 'provided':
+                    self.log.append('User provided background genes!')
+
+                gene_mask, label_vector, gene_info_dict = self._make_gene_mask(query_list, background_list, num_background_genes, background_strategy, seed = seed)
+
+            #based on genes selected, only a subset of data can be loaded
+            if not self._is_loaded:
+                self._load_data(gene_mask)
+                #this method sets _is_loaded to True
+ + with h5.File(self.data_source, 'r') as data: + + #run each assay specified in the _load_data step. Each assay returns a p_value for each factor binding sample, and returns metadata + #each assay takes the same arguments and makes the same returns but can perform different tests + assay_pvals, assay_info = {},{} + for assay in self.assays: + assay_pvals[assay.technology + '_p_value'] = assay.predict(gene_mask, label_vector, data_object = data) + assay_info[assay.technology + '_model_info'] = assay.get_info() + + except (FileNotFoundError, OSError): + raise DownloadRequiredError('Data is malformed or incomplete, run "lisa download [species]" to redownload dataset') + + #combine p-values from all assays on a per-sample basis + with self.log.section('Mixing effects using Cauchy combination ...'): + + aggregate_pvals = np.array(list(assay_pvals.values())).T + + combined_p_values = self._combine_tests(aggregate_pvals) + + self.log.append('Formatting output ...') + #instantiate a lisa results object + results = LISA_Results.fromdict( + **self.assays[0].get_metadata(), + combined_p_value = combined_p_values, + combined_p_value_adjusted = list(np.minimum(np.array(combined_p_values) * len(combined_p_values), 1.0)), + **assay_pvals + ) + + results = results.sortby('combined_p_value_adjusted', add_rank = True) + + self.log.append('Done!') + + #if using in one-shot mode, prevent user from calling "predict" on this object again + self.used_oneshot = True + #return model_metadata as big dictionary + return results, dict( + **gene_info_dict, + **assay_info + ) \ No newline at end of file diff --git a/lisa/models.py b/lisa/models.py new file mode 100644 index 0000000..3353525 --- /dev/null +++ b/lisa/models.py @@ -0,0 +1,179 @@ +""" +This file defines the ML models used in Lisa. The class hierarchy is shown below (* = applied directly by Lisa): + +BaseEstimator --> EstimatorInterface --> SampleSelectionModel --> LR_BinarySearch_SampleSelectionModel* + | + | + --> ChromatinModel --> LR_ChromatinModel* + +The EstimatorInterface enforces "fit" and "get_info" functions, and the SampleSelectionModel and ChromatinModel enforce functions +for the selection of discriminative datasets, and the training of an accurate chromatin model, respectively. + +Users may extend the SampleSelectionModel or ChromatinModel class to define new behavior for Lisa. 
+""" + +from sklearn.base import BaseEstimator +from sklearn.model_selection import GridSearchCV +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler +from sklearn.feature_selection import SelectKBest, f_classif +import numpy as np +from sklearn.exceptions import ConvergenceWarning +import warnings +warnings.filterwarnings("ignore", category=ConvergenceWarning) + + +class EstimatorInterface(BaseEstimator): + #Wraps the sklearn Estimator class, and enforces that the user implement a "fit" and "get_info" method + def fit(self): + raise NotImplementedError("Must implement \"fit\" method of ChromatinModel") + def get_info(self): + raise NotImplementedError("Must implement \"get_info\" method of ChromatinModel") + + +class SampleSelectionModel(EstimatorInterface): + """ + Extends the EstimatorInterface class, and enforces "get_selected_datasets" and "get_num_selected_datasets" methods + get_selected_datasets should return a boolean index of the "selected status of each dataset in the data matrix + """ + def get_selected_datasets(self): + raise NotImplementedError() + def get_num_selected_datasets(self): + raise NotImplementedError() + +class LR_BinarySearch_SampleSelectionModel(SampleSelectionModel): + """ + Uses logistic regression to select n best discriminatory datasets. Uses binary search to tune the regularization parameter. + max_iters: number of binary_search iterations to run, max + num_datasets_selected: find best n datasets + epsilon: tolerance. If dataset's regression coef is > epsilon, it is considered "selected" + penalty: l1, use l1 penalty to choose datasets + tol: tolerance, stop condition for LR model + penalty_range: high and low boundaries for the C regularization parameter of the LR model. + C = 2**-penalty, and C is inverse of regularization. Therefor for range [-1, 10], the minimum C = 2**1 = 2, very slightly regularized, + and C = 2**-10, or very highly regularized. 
+    def __init__(self, num_anova_features, num_datasets_selected, max_iters = 50,
+        epsilon = 1e-7, penalty = 'l1', tol = 0.01, penalty_range = (-1, 10)):
+        self.max_iters = max_iters
+        self.num_datasets_selected = num_datasets_selected
+        self.epsilon = epsilon
+        self.penalty = penalty
+        self.tol = tol
+        self.penalty_range = penalty_range
+        self.num_anova_features = num_anova_features
+
+    #binary search to close in on optimal penalty value
+    def _binary_search(self, X, y, low, high, iter_num = 0):
+        penalty = (high - low) / 2 + low
+        self.model.C = 2**-penalty
+        self.model.fit(X,y)
+        #get upweighted datasets
+        num_datasets_selected = self.get_num_selected_datasets()
+
+        #break if the desired amount of datasets were used or max iters reached
+        if num_datasets_selected == self.num_datasets_selected or iter_num == self.max_iters:
+            return self
+
+        return self._binary_search(X, y, penalty, high, iter_num + 1) if num_datasets_selected > self.num_datasets_selected \
+            else self._binary_search(X, y, low, penalty, iter_num + 1)
+
+
+    #returns binary index of selected datasets
+    def get_selected_datasets(self):
+        return np.squeeze(np.abs(self.model.coef_) > self.epsilon)
+
+    #returns the number of selected datasets
+    def get_num_selected_datasets(self):
+        return self.get_selected_datasets().sum()
+
+    #instantiates a new LR model, then tunes the C parameter to get n datasets
+    def fit(self, rp_matrix, labels):
+
+        # Normalize with mean centering and log transformation
+        index_array = np.arange(rp_matrix.shape[1])
+
+        X = StandardScaler(with_std = False).fit_transform( np.log2(rp_matrix + 1) )
+
+        #narrow features with anova selection
+        anova_features = SelectKBest(f_classif, k=self.num_anova_features).fit(X, labels).get_support()
+        #get indices mapping anova features to original features
+        index_array = index_array[anova_features]
+        #subset selected features
+        X = X[:, anova_features]
+
+        #instantiate LR search model
+        self.model = LogisticRegression(penalty = self.penalty, tol = self.tol, solver = 'liblinear', random_state = 777)
+        #use binary search technique to find datasets
+        self._binary_search(X, labels, *self.penalty_range)
+        #get boolean mask for selected datasets
+        lr_selections = self.get_selected_datasets()
+        #map LR selections back to original features as integer indices
+        dataset_selections = index_array[lr_selections]
+
+        return dataset_selections
+
+    #gets information on parameters chosen by user, and parameters of LR model to save to json logs
+    def get_info(self):
+        return dict(
+            search_params = self.get_params(),
+            search_model_params = self.model.get_params(),
+            dataset_coefs = list(np.squeeze(self.model.coef_)),
+        )
+
+
+class ChromatinModel(EstimatorInterface):
+    """
+    Enforces that a chromatin model class implements get_deltaRP_activation
+    """
+    def get_deltaRP_activation(self, X):
+        raise NotImplementedError()
+
+
+class LR_ChromatinModel(ChromatinModel):
+    """
+    Logistic regression chromatin model. Fits the best LR classifier using grid search
+    """
+    def __init__(self, param_grid, kfold_cv = 4, scoring = 'roc_auc', penalty = 'l1'):
+        self.kfold_cv = kfold_cv
+        self.param_grid = param_grid
+        self.scoring = scoring
+        self.penalty = penalty
+
+    def fit(self, rp_matrix, labels, n_jobs = 1):
+
+        self.rp_0 = rp_matrix
+
+        self.normalizer = StandardScaler(with_std = False)
+
+        X0 = self.normalizer.fit_transform( np.log2(self.rp_0 + 1) )
+
+        #define a classifier for query vs background genes
+        classifier = LogisticRegression(penalty = self.penalty, solver = 'lbfgs' if self.penalty == 'l2' else 'liblinear', random_state = 777)
+        #optimize model with grid search
+        self.grid_search = GridSearchCV(classifier, param_grid = self.param_grid, scoring = self.scoring, cv = self.kfold_cv, n_jobs = n_jobs)\
+            .fit(X0, labels)
+
+        self.model = self.grid_search.best_estimator_
+
+        return self
+
+    def get_deltaRP_activation(self, rp_knockout):
+        """
+        rp_knockout: is a datacube of shape (genes, samples, TFs),
+        this method must implement a transformation into a genes x TFs matrix, summarizing the dataset-axis effects
+        """
+        #define deltaX to be the log2 ratio of post-knockout RP to original RP
+        deltaX = np.log2(self.rp_0[:,:,np.newaxis] - rp_knockout + 1) - np.log2(self.rp_0[:,:,np.newaxis] + 1)
+
+        #flip sign so that more knockout = more deltaR
+        return -1 * deltaX.transpose(0,2,1).dot(self.model.coef_.reshape(-1))
+
+
+    def get_info(self):
+        return dict(
+            model_params = self.model.get_params(),
+            coefs = list(np.squeeze(self.model.coef_)),
+            auc_score = self.grid_search.best_score_
+        )
+
diff --git a/lisa/tests/__init__.py b/lisa/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lisa/tests/gene_selection_tests.py b/lisa/tests/gene_selection_tests.py
new file mode 100644
index 0000000..fb12048
--- /dev/null
+++ b/lisa/tests/gene_selection_tests.py
@@ -0,0 +1,138 @@
+
+
+import unittest
+from unittest import TestCase
+import configparser
+from lisa import LISA
+import lisa.gene_selection as gene_selection
+import numpy as np
+import lisa.models as models
+import lisa.assays as assays
+import lisa.utils as utils
+
+class TestGeneSelection(TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.lisa = LISA('hg38', verbose = 0)
+        cls.geneset, _ = cls.lisa._load_gene_info()
+        cls.query_list = 'PSG5,PSG3,GATA4,CD83,APOBEC2,DFNA5,S100A9,PSG4,SLC26A2,PDGFRA,HAS3,PRSS12,RGS8,S100A12,C8orf49,SLC9A7,TMEM65,PSG9,RGS5,SYTL5,CST1,PSG7,TNFAIP3,GPRC5B,CST4,CHN2,FAM135A,RYR2,RGN,RASGRP1,GCNT3,PSG6,APLF,SORBS2,GSTM4,TNIK,PPARGC1A,TM2D1,PRKD3,CYMP,ABCG2,ZNF114,GALC,SMIM24,SIPA1L2,GDPD1,MMD,PSG2,LINC01300,TMC5,FAM46C,THUMPD3-AS1,POLR1D,CDH11,TRABD2B,MBNL2,CACNA1D,GOLT1A,RPS29,FPR3,SELENOP,COL28A1,PSD3,HKDC1,UNC5CL,NLRP7,LINC00479,PTPRH,CCNYL1,SLC9A2,LINC01468,ELFN2,MECOM,PKP4,IGFBP5,KALRN,SSX2IP,LYZ,RBP2,ZNF704,RP11-248J18.2,TRPM7,PRR9,TMEM163,WSB1,NAV2,TMPRSS2,RP11-356I2.4,IGFL1,GDA,SGK1,C9orf152,CTH,RHOU,CALD1,HGSNAT,ZNF654,RRBP1,PRKCA,FAM134B'
+        cls.query_list = [gene.upper().strip() for gene in cls.query_list.split(',')]
+
+        cls.background_list = 
+        cls.background_list = 'ORC2,CHST4,LSAMP,CCDC36,OR10P1,POLA2,CD1D,TCN1,SLC27A4,STK19,CLINT1,GNB1L,AK9,WDR41,NEUROG2,DAPK1,C19orf52,HRH4,CASS4,H2BFM,OR51E2,NOTCH3,LOXL4,HEATR9,ITGBL1,ITPRIP,GNAQ,NGEF,MUC21,FAF2,B4GALT1,CT45A5,COPG2,DBX2,BRCA2,PIGA,OR4D1,ABCA2,AHSG,BEX2,PRMT2,OTOP2,NDUFB1,BHLHA9,PHB2,NBN,PHYHD1,SCPEP1,THUMPD2,UBL7,ARMCX3,VHL,SYCP2L,TRIM55,C19orf66,TRAT1,CCL3L3,C1orf115,EFNA3,NOS1,C3orf67,RASSF6,MROH7,MIOS,ACBD3,ZDHHC22,LSM1,MS4A6E,HIST1H2AL,PPTC7,PTPN20,MOV10L1,ACADVL,DDX41,CAPNS1,CES1,WDR13,LIN28A,TEX29,UBL4A,ZNF324,IRF8,DDX5,SLC25A12,GNG2,PCGF3,PTCD2,CNTN4,C5AR1,AKR1B15,ATXN3,SERPINB11,AGAP1,TXNDC5,SLC35G5,GPRIN1,LRRC10B,DCANP1,AKR1D1,MYH13,KATNAL2,COL4A4,GREM2,SLC10A7,SEMA4C,MRPL16,SURF1,ZNF33A,ADAMTS12,PLA2G2E,SLC16A3,VSTM2L,POMK,SIN3A,ZNF839,DHX9,TBATA,GPR25,TXNDC12,DCBLD1,ATP6AP2,ZNF670,SMAD5,GOLGA6A,IFI27L2,ZNF625,HOXD11,PARD6G,HOMER3,CEP120,BARHL2,KRT75,SCYL1,OR1F1,MOB3C,ABCB8,C10orf2,PRAC1,RNF114,CADM3,ZNF510,CD37,DYNC1H1,SLITRK4,EXOSC4,NELFCD,UQCC3,PURA,SLC35F4,GSTA5,CALR,LYN,FAM126B,RBM5,MRFAP1,MGAT4A,UBXN2B,PRDM7,NOL8,RBFOX3,LRRTM4,COL6A3,SGTA,KIF2B,CHST2,CDKN2D,VMO1,GBP3,DSG2,FMNL3,LGALS13,KRTAP9-9,CFAP65,PCDHGA9,ACOT12,CFL2,FGF23,OR4F4,PCDHB16,C9orf135,ENTPD2,EPN3,PNMA5,SNCAIP,SYNE3,REM1,DACT3,FAM83D,COMMD8,MAP2,TSPAN9,RHPN1,IER5L,GOLGA8H,TMEM67,RALGDS,VTCN1,PSMB1,ALOX12,SMARCA1,PIGQ,MTRNR2L5,IRGM,LOC389199,CDC73,FGFBP2,DUSP13,RAPH1,UBE2D2,FOXP4,PCDHB6,SCAMP1,ADAMTS5,SPINT2,VPS53,SGK494,MSH6,RFX5,BRAT1,KIAA1033,LRRTM1,TMEM41B,TREH,STXBP5L,MYO15A,ZNF48,INS-IGF2,C10orf111,NMNAT2,DUSP22,RBP5,CARD16,OIT3,TNFRSF12A,CLUH,ZNF641,ZNF846,CLEC9A,ARMC7,SEMA4G,GLRX3,TKFC,TOR3A,PEX7,DIAPH1,MEI4,MAP4K2,GGACT,SLC41A3,ISY1-RAB43,OR5AS1,LRRC75B,AGBL4,OSR1,SOCS7,HTR3D,THEMIS,CCDC60,RAB19,OR5T2,FABP2,SLC22A24,CIB4,PSMD11,PHC1,TEX11,PUS3,CSNK2A3,PSME3,DDRGK1,EEF2K,OR4C46,KLHL6,MYH6,IDH3B,LRRC69,NBPF1,DSTN,CCDC174,CYB5A,CT47A2,IGFBP3,GLI1,ARSD,ZNF579,CAPN2,SLC2A9,MARCH7,GIP,CD99,TIRAP,RNF138,DUOXA2,SPANXD,PDE7A,EXOC3,LAMA4,C8orf89,ADCY5,AVPR1A,SSSCA1,LRRC4,SLC3A1,NDUFC2,ZNF773,METTL12,C2orf74,KLK3,LOC100131107,UBXN4,MRPL53,PAPOLB,CFHR2,FLI1,HIST1H1C,ZBTB4,CCPG1,CNPY4,GADL1,ABHD2,FAM219A,MET,PIEZO2,ARHGEF15,OR51G1,RAX2,SMDT1,QRICH1,ITPR2,CCL1,PBXIP1,NEU2,FNDC3A,PLD5,RFWD3,LY6E,RDH16,MFAP1,SLC39A14,F8,NANOS1,PFDN6,C17orf75,SLFN5,KPNB1,DNA2,OR2T8,FAM65B,PIK3CD,MARCH6,SIK2,SIGLEC5,TLR3,C11orf88,CSNK1G3,TRPV1,TMEM80,FAM183A,SLC16A14,CCT5,RSPRY1,NKAIN2,SLC27A2,PPHLN1,FABP5,CAMTA1,STAMBP,C1orf106,ZNF354A,KRTAP4-9,CNTD2,KIF22,TM7SF2,ARMCX4,ZNF705G,TUBA3C,ADGRL4,NCF2,SERF1A,PPP4R1,SMARCD3,SFTPC,SELE,MYC,KLF8,YEATS2,TRAK1,MAN1A1,F12,FAM107B,EVI2B,CYP2D6,CLEC4G,C18orf54,SYNRG,DMTN,FGF8,FHOD1,SYPL1,BAIAP2L1,SELP,AKR7A2,SLC25A22,RABGGTA,DCUN1D3,AFG3L2,SNAP25,CEP95,ACSL6,OR4D2,OBP2B,ARPC3,CHP2,PEX11G,MROH9,RNF219,NEIL1,RASL12,TRAPPC3,ALDH18A1,RAB25,KDSR,PRSS38,LTA,FUT1,AXIN1,COG2,TNIP2,SLC15A1,CHURC1-FNTB,LYSMD1,REV1,TMEM200C,MYL2,USP26,DLG2,BFSP1,PATL2,FUT5,PRR12,ACSS1,BTF3,PILRB,CSGALNACT2,FOXO3,ZNF506,CADM2,GPR63,PYCRL,RNF10,RPAP3,HSPB8,PAIP2B,VPS4A,CFHR3,CDC123,ANKUB1,CD1B,TVP23A,ZNF587B,NPTXR,APLP2,SPTLC2,FBXL3,ZNF701,GOLGA8J,MOSPD2,NFASC,CNGA1,SLC36A1,LIN54,CRABP2,HEMGN,UGT1A4,OR6B2,CPNE2,CXorf40B,C2orf61,ZADH2,FAM129A,ZWINT,APOH,SHISA9,OGG1,RASSF7,RDH14,ISLR,GPR87,GAD1,ASTN2,WDR54,MRAP2,MT1B,C22orf46,UQCRQ,PHOX2A,MEPE,TRIM72,PEX13'
+        cls.background_list = [gene.upper().strip() for gene in cls.background_list.split(',')]
+
+    def validate_sampling(self, label_dict, gene_info, num_genes = 3000):
+
+        query_genes = [k for k, v in label_dict.items() if v]
+        background_genes = [k for k, v in label_dict.items() if not v]
+
+        self.assertEqual(len(query_genes), 97)
+        self.assertEqual(len(background_genes), num_genes)
+        self.assertEqual(len(set(query_genes).intersection(set(background_genes))), 0)
+        self.assertEqual(len(set(gene_info['query_symbols']).intersection(set(gene_info['background_symbols']))), 0)
+
+    def test_tad_background_sampling(self):
+
+        label_dict, infodict = gene_selection.select_genes(self.query_list, self.geneset, seed = 0)
+
+        self.validate_sampling(label_dict, infodict)
+
+    def test_random_background_sampling(self):
+
+        label_dict, infodict = gene_selection.select_genes(self.query_list, self.geneset, background_strategy = 'random')
+
+        self.validate_sampling(label_dict, infodict)
+
+    def test_background_provided(self):
+
+        label_dict, infodict = gene_selection.select_genes(self.query_list, self.geneset, background_strategy = 'provided',
+            background_list = self.background_list)
+
+        self.validate_sampling(label_dict, infodict, 499)
+
+    def test_gene_matching(self):
+
+        genes_returned = self.geneset.match_user_provided_genes(self.query_list)
+
+        self.assertEqual(len(genes_returned), 97)
+
+
+def make_random_labels(num_samples, num_pos):
+    label = np.zeros(num_samples)
+    label[np.random.choice(num_samples, num_pos, replace = False)] = 1
+    #use the builtin bool: np.bool is a deprecated alias
+    label = label.astype(bool)
+    return label
+
+
+class TestModels(TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+
+        cls.dataset_label = make_random_labels(1000, 10)
+        cls.gene_label = make_random_labels(3000, 100)
+
+        matrix_shape = (cls.gene_label.size, cls.dataset_label.size)
+
+        design_matrix = cls.gene_label[:,np.newaxis] * cls.dataset_label[np.newaxis, :]
+
+        cls.dataset_directionality = np.random.choice([-1.0, 1.0], (1, matrix_shape[1])) #random sign per dataset, ~B(1/2)
+        signal = cls.dataset_directionality * (np.random.normal(0, 1, matrix_shape) + np.random.lognormal(0, 0.2, matrix_shape)) # ~ N(lognormal(0,0.2), 1)
+        background = np.random.normal(0, 1, matrix_shape) # ~N(0,1)
+
+        rp_matrix = background + design_matrix * signal
+        cls.rp_matrix = rp_matrix - rp_matrix.min()
+
+        selector = models.LR_BinarySearch_SampleSelectionModel(200, 10)
+
+        cls.selections = selector.fit(cls.rp_matrix, cls.gene_label)
+
+    def test_dataset_selection_model(self):
+
+        dataset_label_index = np.where(self.dataset_label)[0]
+
+        self.assertEqual(len(set(dataset_label_index).difference(set(self.selections))), 0)
+
+    def test_chromatin_model(self):
+
+        model_matrix = self.rp_matrix[:, self.selections]
+
+        chrom_model = models.LR_ChromatinModel({'C' : list(10.0**np.arange(-2, 4.1, 0.5))}, penalty = 'l2')
+
+        chrom_model.fit(model_matrix, self.gene_label)
+
+        coefs = chrom_model.model.coef_
+
+        self.assertTrue(((self.dataset_directionality.reshape(-1)[self.selections] > 0) == (coefs.reshape(-1) > 0)).all())
+
+        #the datacube's middle axis must match the number of datasets the model was fit on
+        test_datacube = np.random.rand(self.gene_label.size, len(self.selections), 1000)
+
+        self.assertTrue(chrom_model.get_deltaRP_activation(test_datacube).shape == (self.gene_label.size, 1000))
+
+
+class TestPeakRP_Assay(TestModels):
+
+    def setUp(self):
+
+        self.model = assays.PeakRP_Assay(None, None, 1, utils.Log(verbose = False))
+
+    def test_peak_rp_p_values(self):
+
+        self.model.rp_matrix = self.rp_matrix
+
+        pvals = self.model.predict(np.ones(len(self.gene_label)).astype(bool), self.gene_label)
+
+        lowest_pvals = np.argsort(pvals)
+
+        dataset_label = np.argwhere(np.logical_and(self.dataset_label.reshape(-1), self.dataset_directionality.reshape(-1) > 0))
+
+        self.assertTrue(np.isin(dataset_label, lowest_pvals[:10]).all())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/lisa/tests/instantiation_tests.py b/lisa/tests/instantiation_tests.py
new file mode 100644
index 0000000..bc08f32
--- /dev/null
+++ b/lisa/tests/instantiation_tests.py
@@ -0,0 +1,27 @@
+
+
+import unittest
+import lisa
+
+class AssertionTests(unittest.TestCase):
+
+    def test_species_instantiation(self):
+        self.assertRaises(AssertionError, lisa.LISA, 'hg39')
+
+    def test_cores(self):
+        self.assertRaises(AssertionError, lisa.LISA, 'hg38', -2)
+        self.assertRaises(AssertionError, lisa.LISA, 'hg38', 'all')
+
+    def test_isd_method(self):
+        self.assertRaises(AssertionError, lisa.LISA, 'hg38', -1, 'motifsorwhatever')
+
+    def test_datasets(self):
+        args = (lisa.LISA, 'hg38', -1, 'chipseq')
+        self.assertRaises(AssertionError, *args, 10, 200)
+        self.assertRaises(AssertionError, *args, 200, -10)
+        self.assertRaises(AssertionError, *args, 0, 0)
+        self.assertRaises(AssertionError, *args, 1000, 200)
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lisa/utils.py b/lisa/utils.py
new file mode 100644
index 0000000..9588402
--- /dev/null
+++ b/lisa/utils.py
@@ -0,0 +1,168 @@
+from sys import stderr
+from contextlib import contextmanager
+from scipy import sparse
+import numpy as np
+from collections import defaultdict, OrderedDict
+
+def ragged_array_to_sparse_matrix(indices, values, col_length):
+    return sparse.hstack([
+        sparse.csc_matrix(
+            (val_column,
+            (index_column, np.zeros(len(val_column)))),
+            shape = (col_length, 1)
+        )
+        for index_column, val_column in zip(indices, values)
+    ])
+
+class LoadingBar:
+
+    def __init__(self, label, increments, length = 25, cold_start = False):
+        self.increments = increments
+        self.length = length
+        self.label = label
+        self.progress = 0
+        self.cold_start = cold_start
+
+    def __str__(self):
+        if self.cold_start:
+            self.cold_start = False
+        else:
+            self.increment()
+        completed_steps = int(self.progress / self.increments * self.length)
+        if completed_steps >= self.length:
+            #parenthesized for clarity: the conditional applies to the whole expression
+            return ('{}: [{}]'.format(self.label, "="*completed_steps) + '\n') if self.is_finished() else ''
+        else:
+            return '{}: [{}>{}]'.format(self.label, "="*completed_steps, " "*(self.length - completed_steps - 1))
+
+    def increment(self):
+        if not self.is_finished():
+            self.progress += 1
+
+    def is_finished(self):
+        return self.progress >= self.increments
+
+
+class Log:
+
+    def __init__(self, target = stderr, verbose = True):
+        self.target = target
+        self.indents = 0
+        assert isinstance(verbose, (int, bool)), 'Verbosity must be set to integer or boolean value.'
+        self.verbose = verbose
+
+    @contextmanager
+    def section(self, header):
+        try:
+            self.start_section(header)
+            yield self
+        finally:
+            self.end_section()
+
+    def start_section(self, section_header):
+        self.append(section_header)
+        self.indents += 1
+
+    def end_section(self):
+        self.indents -= 1
+
+    def append(self, text, end = '\n', update_line = False):
+        linestart = '\r' if update_line else ''
+        #boolean True prints everything; an integer prints only sections nested less deeply than the verbosity level
+        if (isinstance(self.verbose, bool) and self.verbose == True) or (isinstance(self.verbose, int) and self.indents < self.verbose):
+            print(linestart + '\t'*self.indents + str(text),
+                end = '' if update_line else end,
+                file = self.target)
+
+
+class LISA_Results:
+
+    @classmethod
+    def fromdict(cls, **kwargs):
+        d = kwargs
+        return cls(list(d.keys()), list(d.values()))
+
+    def __init__(self, keys, columns):
+        self.results_headers = keys
+        self.results_rows = self._transpose_table(columns)
+
+    def get_colnum(self, colname):
+        try:
+            return np.argwhere(np.array(self.results_headers) == colname)[0][0]
+        except IndexError:
+            raise IndexError('Column {} not in results table'.format(str(colname)))
+
+    def sortby(self, key, add_rank = False):
+
+        if isinstance(key, str):
+            key = self.get_colnum(key)
+
+        self.results_rows = sorted(self.results_rows, key = lambda row : row[key])
+
+        if add_rank:
+            row_ranks = range(1, len(self) + 1)
+            try:
+                self.update_column('Rank', row_ranks)
+            except IndexError:
+                self.add_column('Rank', row_ranks, 0)
+
+        return self
+
+    def __len__(self):
+        return len(self.results_rows)
+
+    @staticmethod
+    def _transpose_table(table):
+        return [list(l) for l in list(zip(*table))]
+
+    def todict(self):
+        return dict( zip( self.results_headers, self._transpose_table(self.results_rows)))
+
+    def update_column(self, name, data):
+
+        colnum = self.get_colnum(name)
+
+        for row, value in zip(self.results_rows, data):
+            row[colnum] = value
+
+    def add_column(self, name, data, column_num = 0):
+
+        assert name not in self.results_headers, 'Column already exists'
+
+        if column_num == -1:
+            self.results_headers.append(name)
+            for row, value in zip(self.results_rows, data):
+                row.append(value)
+
+        else:
+            self.results_headers.insert(column_num, name)
+            for row, value in zip(self.results_rows, data):
+                row.insert(column_num, value)
+
+        return self
+
+    def subset(self, rows):
+
+        if isinstance(rows, int):
+            rows = [rows]
+
+        return LISA_Results(self.results_headers, self._transpose_table([
+            self.results_rows[row] for row in rows
+        ]))
+
+    def to_tsv(self, top_n = 200):
+
+        #clamp to the table length so short results tables do not raise an IndexError
+        output_lines = self.subset(range(min(top_n, len(self))))
+        return '\n'.join([
+            '\t'.join([str(value) for value in line])
+            for line in [output_lines.results_headers, *output_lines.results_rows]
+        ])
+
+    def filter_rows(self, filter_func, colname):
+
+        colnum = self.get_colnum(colname)
+        subset_rows = [
+            rownum for rownum, row in enumerate(self.results_rows)
+            if filter_func(row[colnum])
+        ]
+
+        return self.subset(subset_rows)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a33503a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,26 @@
+
+
+from setuptools import setup
+
+with open('lisa/_version.py', 'r') as v:
+    version = v.read().split('=')[-1].strip()
+
+setup(
+    name = 'lisa',
+    description = """Lisa: inferring transcriptional regulators through integrative modeling of public chromatin accessibility and ChIP-seq data\n
+X. Shirley Liu Lab, 2020""",
+    version = version,
+    url = 'https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-1934-6',
+    author = 'Allen Lynch',
+    author_email = 'alynch@ds.dfci.harvard.edu',
+    packages = ['lisa'],
+    zip_safe = False,
+    scripts = ['bin/lisa'],
+    install_requires = [
+        'numpy',
+        'scipy',
+        'h5py',
+        'scikit-learn'
+    ],
+    include_package_data=True,
+)
\ No newline at end of file
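
For orientation, here is a minimal usage sketch (not part of the commit) showing how the two models in lisa/models.py compose, mirroring the synthetic setup in lisa/tests/gene_selection_tests.py. The matrix shapes, dataset counts, and random seed are illustrative assumptions, not values taken from the package.

# Hypothetical sketch: two-stage modeling on synthetic data.
import numpy as np
import lisa.models as models

rng = np.random.RandomState(0)
rp_matrix = rng.lognormal(0, 1, size = (3000, 500))    # genes x datasets (assumed shapes)
labels = np.zeros(3000, dtype = bool)
labels[rng.choice(3000, 100, replace = False)] = True  # query genes vs sampled background

# Stage 1: ANOVA pre-filter to 200 features, then binary-search the L1 penalty
# until ~10 datasets carry non-zero coefficients; fit returns column indices.
selector = models.LR_BinarySearch_SampleSelectionModel(200, 10)
selected = selector.fit(rp_matrix, labels)

# Stage 2: grid-search an L2 logistic regression over the selected datasets.
chrom_model = models.LR_ChromatinModel({'C' : list(10.0**np.arange(-2, 4.1, 0.5))}, penalty = 'l2')
chrom_model.fit(rp_matrix[:, selected], labels)

# A knockout datacube of shape (genes, selected datasets, TFs) yields a
# genes x TFs deltaRP activation matrix.
datacube = rng.rand(3000, len(selected), 50)
delta_rp = chrom_model.get_deltaRP_activation(datacube)
print(delta_rp.shape)  # (3000, 50)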