From 9ec9f60dd3f66bbf8a6c59c87868c4e95511a5b8 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 2 Sep 2021 11:28:40 -0400 Subject: [PATCH] add binning analysis --- .DS_Store | Bin 10244 -> 10244 bytes sceptre_paper/.DS_Store | Bin 14340 -> 14340 bytes .../setup_binning_analysis_7.R | 34 +++++++++++++--- ...6_V2.R => append_simple_names_results_6.R} | 4 +- .../analysis_drivers_xie/bulk_validation_7.R | 4 +- .../cis_enrichment_analysis_9.R | 8 ++-- .../select_gRNA_gene_pair_5.R | 2 + .../setup_binning_analysis_10.R | 38 +++++++++++++++--- .../significance_score_8.R | 20 ++++----- 9 files changed, 81 insertions(+), 29 deletions(-) rename sceptre_paper/analysis_drivers/analysis_drivers_xie/{append_simple_names_results_6_V2.R => append_simple_names_results_6.R} (95%) diff --git a/.DS_Store b/.DS_Store index 61aaf1755c3ca270c9f87024fe3ea142ba11cc6c..e34fb84a7a1ac9f7d65881aa604228116a88c68a 100644 GIT binary patch delta 56 zcmV-80LTA?P=rvh*%ATAlYS9^2o`&LFf1TBGm{At9s!q=C=wq5n3J#)9<#&~Lky9C O1+xbr`UJBE6$1ktA`zhg delta 53 zcmZn(XbISGQ;6}*Zv@@!^T_{FmM Ij|ej}0PGkMm;e9( diff --git a/sceptre_paper/.DS_Store b/sceptre_paper/.DS_Store index 3975cfa32f554b403e5147fc5a2168b38daac49b..fdaaac167d31504ff70ec31b1b0781d027de17af 100644 GIT binary patch delta 37 tcmZoEXeroGtuUEG>GWiN1;x!dinIA9Hn47HH~7XfdA_>iW@pu7OaK!f4gUZD delta 62 zcmV-E0Kxx+aD;HMdmsVKlOPg+4I6uVEiyDTEFdv4H#3uw86E+gldu^d0iBa@AtbYB UA)5}f2Qc;olYkQ^vmYhk0*Pr9iU0rr diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_gasp/setup_binning_analysis_7.R b/sceptre_paper/analysis_drivers/analysis_drivers_gasp/setup_binning_analysis_7.R index e6a3a15..dc60f29 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_gasp/setup_binning_analysis_7.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_gasp/setup_binning_analysis_7.R @@ -1,12 +1,14 @@ -args <- commandArgs(trailingOnly = TRUE) -offsite_dir <- if (is.na(args[2])) .get_config_path("LOCAL_SCEPTRE_DATA_DIR") else args[1] -processed_dir <- paste0(offsite_dir, "/data/gasperini/processed") +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") +processed_dir <- paste0(offsite_dir, "data/gasperini/processed") +results_dir <- paste0(offsite_dir, "results/gasperini/sceptre") + # source args # Load the expression FBM to compute the gene expressions exp_mat_metadata <- readRDS(paste0(processed_dir, "/expression_FBM_metadata.rds")) ordered_gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) # update backing file -exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix") +exp_mat_metadata$backingfile <- paste0(processed_dir, "/GSE120861_at_scale_screen.exprs") # load FBM given backing file load_fbm <- function(fbm_metadata) { out <- bigstatsr::FBM(nrow = fbm_metadata$nrow, ncol = fbm_metadata$ncol, @@ -15,12 +17,34 @@ load_fbm <- function(fbm_metadata) { return(out) } exp_mat <- load_fbm(exp_mat_metadata) +n_cells <- ncol(exp_mat) # compute the count of each gene gene_expressions <- bigstatsr::big_apply(X = exp_mat, a.FUN = function(x, ind) { colSums(x[,ind]) }, a.combine = "c") -# Finally, put the gene expression counts into a data frame +# Put the gene expression counts into a data frame gene_expressions <- data.frame(gene_id = ordered_gene_ids, expression = gene_expressions) saveRDS(gene_expressions, paste0(processed_dir, "/gene_expressions.rds")) +results <- fst::read_fst(paste0(results_dir, "/resampling_results.fst")) + +# Run the binning analysis +conduct_binning_analysis <- function(gene_expressions, results, k) { + results_cis <- results %>% dplyr::filter(site_type == "DHS") + # divide the cis genes into k groups based on expression level + cis_genes <- results_cis %>% dplyr::pull(gene_id) %>% as.character() %>% unique() + gene_expressions_cis <- gene_expressions %>% dplyr::filter(gene_id %in% cis_genes) %>% + dplyr::mutate(rank = dplyr::ntile(expression, n = k)) %>% + dplyr::mutate(m_expression = expression/n_cells) + + # for each rank, compute the fraction of pairs rejected + gene_expressions_cis %>% dplyr::group_by(rank) %>% dplyr::group_modify(.f = function(tbl, key) { + results_cis %>% dplyr::filter(gene_id %in% tbl$gene_id) %>% + dplyr::summarize(p_rejected = 100 * mean(rejected), n_genes = nrow(tbl), + m_exp = mean(tbl$m_expression), n_pairs = length(pair_id)) + }) +} + +binning_result_gasp <- conduct_binning_analysis(gene_expressions, results, 5) %>% dplyr::select(rank, n_genes, n_pairs, m_exp, p_rejected) + diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/append_simple_names_results_6_V2.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/append_simple_names_results_6.R similarity index 95% rename from sceptre_paper/analysis_drivers/analysis_drivers_xie/append_simple_names_results_6_V2.R rename to sceptre_paper/analysis_drivers/analysis_drivers_xie/append_simple_names_results_6.R index a1f8838..0f6bd4f 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/append_simple_names_results_6_V2.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/append_simple_names_results_6.R @@ -1,5 +1,5 @@ -args <- commandArgs(trailingOnly = TRUE) -code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1] +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) # load data diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/bulk_validation_7.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/bulk_validation_7.R index 64be53a..74da0a7 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/bulk_validation_7.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/bulk_validation_7.R @@ -1,6 +1,6 @@ # Bulk RNA-seq -args <- commandArgs(trailingOnly = TRUE) -code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1] +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) library(tidyverse) library(fst) diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/cis_enrichment_analysis_9.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/cis_enrichment_analysis_9.R index e424f61..ca49296 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/cis_enrichment_analysis_9.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/cis_enrichment_analysis_9.R @@ -1,5 +1,5 @@ -args <- commandArgs(trailingOnly = TRUE) -code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1] +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) library(readxl) @@ -270,8 +270,8 @@ if (!file.exists(sprintf("%s/rejected_pairs_HIC.tsv", results_dir_enrichment))) KRnorm = read_tsv(sprintf("%s/HIC/GSE63525_K562_intrachromosomal_contact_matrices/K562/%s_resolution_intrachromosomal/%s/%s/%s_%s.KRnorm", functional_data_dir,resolution_name,chr,quality,chr,resolution_name), - col_names = "normalization", col_types = "d") %>% - pull() + col_names = "normalization", col_types = "c") %>% + pull() %>% as.numeric() rejected_pairs_chr = rejected_pairs %>% filter(chr == !!chr, !is.na(TAD_left)) %>% diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R index 4e8edb6..5f7fbc7 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R @@ -213,6 +213,8 @@ df1_neg_control <- neg.control.pair %>% mutate(gene.hgnc.id = NULL) %>% rename(" df2_cis_pairs <- select.gRNA.gene.pair %>% rename("gRNA_id" = "gRNA.id", "gene_id" = "gene.id") %>% mutate(type = "cis") bulk_regions <- readRDS(paste0(processed_dir, "/bulk_region_names.rds")) %>% filter(region_name %in% c("ARL15-enh", "MYB-enh-3")) + +# be sure to add ARL15 and MYB to the bulk region gene_ids df3_bulk_validation <- expand.grid(gene_id = gene.id, gRNA_id = bulk_regions$region) %>% mutate(type = "bulk_validation") all_pairs <- rbind(df1_neg_control, df2_cis_pairs, df3_bulk_validation) diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/setup_binning_analysis_10.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/setup_binning_analysis_10.R index 5841327..abea305 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/setup_binning_analysis_10.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/setup_binning_analysis_10.R @@ -1,13 +1,14 @@ -args <- commandArgs(trailingOnly = TRUE) -offsite_dir <- if (is.na(args[2])) .get_config_path("LOCAL_SCEPTRE_DATA_DIR") else args[1] -processed_dir <- paste0(offsite_dir, "/data/xie/processed") +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") +source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) +analysis_ready_dir <- paste0(offsite_dir, "data/xie/analysis_ready") # source args # Load the expression FBM to compute the gene expressions -exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) +exp_mat_metadata <- readRDS(paste0(analysis_ready_dir, "/exp_mat_sub_metadata.rds")) ordered_gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) # update backing file -exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix") +exp_mat_metadata$backingfile <- paste0(analysis_ready_dir, "/expression_matrix_sub") # load FBM given backing file load_fbm <- function(fbm_metadata) { out <- bigstatsr::FBM(nrow = fbm_metadata$nrow, ncol = fbm_metadata$ncol, @@ -16,6 +17,7 @@ load_fbm <- function(fbm_metadata) { return(out) } exp_mat <- load_fbm(exp_mat_metadata) +n_cells <- nrow(exp_mat) # compute the count of each gene gene_expressions <- bigstatsr::big_apply(X = exp_mat, a.FUN = function(x, ind) { @@ -24,4 +26,28 @@ gene_expressions <- bigstatsr::big_apply(X = exp_mat, a.FUN = function(x, ind) { # Finally, put the gene expression counts into a data frame gene_expressions <- data.frame(gene_id = ordered_gene_ids, expression = gene_expressions) -saveRDS(gene_expressions, paste0(processed_dir, "/gene_expressions.rds")) +# saveRDS(gene_expressions, paste0(processed_dir, "/gene_expressions.rds")) + +# conduct the binning analysis with 3 bins (fewer pairs were tested) +gene_expressions <- readRDS(paste0(processed_dir, "/gene_expressions.rds")) +results <- fst::read_fst(paste0(results_dir, "/all_results_annotated.fst")) +results_inb <- fst::read_fst(paste0(results_dir_negative_binomial, "/all_results_annotated.fst")) + +conduct_binning_analysis <- function(gene_expressions, results, k) { + results_cis <- results %>% dplyr::filter(type == "cis") + # divide the cis genes into k groups based on expression level + cis_genes <- results_cis %>% dplyr::pull(gene_id) %>% as.character() %>% unique() + gene_expressions_cis <- gene_expressions %>% dplyr::filter(gene_id %in% cis_genes) %>% + dplyr::mutate(rank = dplyr::ntile(expression, n = k)) %>% + dplyr::mutate(m_expression = expression/n_cells) + + # for each rank, compute the fraction of pairs rejected + gene_expressions_cis %>% dplyr::group_by(rank) %>% dplyr::group_modify(.f = function(tbl, key) { + results_cis %>% dplyr::filter(gene_id %in% tbl$gene_id) %>% + dplyr::summarize(p_rejected = 100 * mean(rejected), n_genes = nrow(tbl), + m_exp = mean(tbl$m_expression), n_pairs = length(pair_str)) + }) +} + +binning_result_xie <- conduct_binning_analysis(gene_expressions, results, 3) +binning_result_xie_inb <- conduct_binning_analysis(gene_expressions, results_inb, 3) diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/significance_score_8.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/significance_score_8.R index 4f82b79..e63668c 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/significance_score_8.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/significance_score_8.R @@ -1,5 +1,5 @@ -args <- commandArgs(trailingOnly = TRUE) -code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1] +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) suppressPackageStartupMessages(library(R.matlab)) @@ -16,8 +16,8 @@ gRNA.fname = str_replace(gRNA_id, ':', '-') for(i in 1:length(gRNA.fname)){ dest = paste0(raw_data_dir, '/', gRNA.fname[i], '-down_log-pval.mat') - download.file(url = paste0('https://github.com/russellxie/Global-analysis-K562-enhancers/blob/master/Notebooks/Data/Hypergeometric_pvals/', - gRNA.fname[i], '-down_log-pval.mat?raw=true'), + download.file(url = paste0('https://github.com/russellxie/Global-analysis-K562-enhancers/blob/master/Notebooks/Data/Hypergeometric_pvals/', + gRNA.fname[i], '-down_log-pval.mat?raw=true'), destfile = dest) } @@ -35,10 +35,10 @@ xie_pval_mat <- as.data.frame(xie_pfiles_r) colnames(xie_pval_mat) <- gRNA_id original_results_xie = do.call('rbind', lapply(1:nrow(gRNA.gene.pair), function(i){ - data.frame(gRNA_id = gRNA.gene.pair$gRNA_id[i], - gene_id = gRNA.gene.pair$gene_id[i], - raw_p_val = xie_pval_mat[as.character(gRNA.gene.pair$gene_id[i]), - as.character(gRNA.gene.pair$gRNA_id[i])], + data.frame(gRNA_id = gRNA.gene.pair$gRNA_id[i], + gene_id = gRNA.gene.pair$gene_id[i], + raw_p_val = xie_pval_mat[as.character(gRNA.gene.pair$gene_id[i]), + as.character(gRNA.gene.pair$gRNA_id[i])], site_type = gRNA.gene.pair$type[i]) })) rownames(original_results_xie) = NULL @@ -78,10 +78,10 @@ SS.down.cis = ss.down[idx_temp, ] m.gene.id = match(gRNA.gene.pair$gene_id[gRNA.gene.pair$type == 'cis'], rownames(SS.down.cis)) m.gRNA.id = match(gRNA.gene.pair$gRNA_id[gRNA.gene.pair$type == 'cis'], colnames(SS.down.cis)) -ss_xie_cis = data.frame(gene_id = gRNA.gene.pair$gene_id[gRNA.gene.pair$type == 'cis'], +ss_xie_cis = data.frame(gene_id = gRNA.gene.pair$gene_id[gRNA.gene.pair$type == 'cis'], gRNA_id = gRNA.gene.pair$gRNA_id[gRNA.gene.pair$type == 'cis'], ss.down = SS.down.cis[cbind(m.gene.id, m.gRNA.id)]) - + ss_thres =sort(ss_xie_cis$ss.down, decreasing = T)[135] ss_xie_cis$reject.down = ss_xie_cis$ss.down >= ss_thres saveRDS(ss_xie_cis, file = paste0(processed_dir, '/ss_xie_cis.rds'))