Skip to content

Commit

Permalink
add binning analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
Tim committed Sep 2, 2021
1 parent a36c43a commit 9ec9f60
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 29 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified sceptre_paper/.DS_Store
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
args <- commandArgs(trailingOnly = TRUE)
offsite_dir <- if (is.na(args[2])) .get_config_path("LOCAL_SCEPTRE_DATA_DIR") else args[1]
processed_dir <- paste0(offsite_dir, "/data/gasperini/processed")
code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
processed_dir <- paste0(offsite_dir, "data/gasperini/processed")
results_dir <- paste0(offsite_dir, "results/gasperini/sceptre")

# source args
# Load the expression FBM to compute the gene expressions
exp_mat_metadata <- readRDS(paste0(processed_dir, "/expression_FBM_metadata.rds"))
ordered_gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
# update backing file
exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix")
exp_mat_metadata$backingfile <- paste0(processed_dir, "/GSE120861_at_scale_screen.exprs")
# load FBM given backing file
load_fbm <- function(fbm_metadata) {
out <- bigstatsr::FBM(nrow = fbm_metadata$nrow, ncol = fbm_metadata$ncol,
Expand All @@ -15,12 +17,34 @@ load_fbm <- function(fbm_metadata) {
return(out)
}
exp_mat <- load_fbm(exp_mat_metadata)
n_cells <- ncol(exp_mat)

# compute the count of each gene
gene_expressions <- bigstatsr::big_apply(X = exp_mat, a.FUN = function(x, ind) {
colSums(x[,ind])
}, a.combine = "c")

# Finally, put the gene expression counts into a data frame
# Put the gene expression counts into a data frame
gene_expressions <- data.frame(gene_id = ordered_gene_ids, expression = gene_expressions)
saveRDS(gene_expressions, paste0(processed_dir, "/gene_expressions.rds"))
results <- fst::read_fst(paste0(results_dir, "/resampling_results.fst"))

# Run the binning analysis
conduct_binning_analysis <- function(gene_expressions, results, k) {
results_cis <- results %>% dplyr::filter(site_type == "DHS")
# divide the cis genes into k groups based on expression level
cis_genes <- results_cis %>% dplyr::pull(gene_id) %>% as.character() %>% unique()
gene_expressions_cis <- gene_expressions %>% dplyr::filter(gene_id %in% cis_genes) %>%
dplyr::mutate(rank = dplyr::ntile(expression, n = k)) %>%
dplyr::mutate(m_expression = expression/n_cells)

# for each rank, compute the fraction of pairs rejected
gene_expressions_cis %>% dplyr::group_by(rank) %>% dplyr::group_modify(.f = function(tbl, key) {
results_cis %>% dplyr::filter(gene_id %in% tbl$gene_id) %>%
dplyr::summarize(p_rejected = 100 * mean(rejected), n_genes = nrow(tbl),
m_exp = mean(tbl$m_expression), n_pairs = length(pair_id))
})
}

binning_result_gasp <- conduct_binning_analysis(gene_expressions, results, 5) %>% dplyr::select(rank, n_genes, n_pairs, m_exp, p_rejected)

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
args <- commandArgs(trailingOnly = TRUE)
code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1]
code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))

# load data
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Bulk RNA-seq
args <- commandArgs(trailingOnly = TRUE)
code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1]
code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
library(tidyverse)
library(fst)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
args <- commandArgs(trailingOnly = TRUE)
code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1]
code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))

library(readxl)
Expand Down Expand Up @@ -270,8 +270,8 @@ if (!file.exists(sprintf("%s/rejected_pairs_HIC.tsv", results_dir_enrichment)))

KRnorm = read_tsv(sprintf("%s/HIC/GSE63525_K562_intrachromosomal_contact_matrices/K562/%s_resolution_intrachromosomal/%s/%s/%s_%s.KRnorm",
functional_data_dir,resolution_name,chr,quality,chr,resolution_name),
col_names = "normalization", col_types = "d") %>%
pull()
col_names = "normalization", col_types = "c") %>%
pull() %>% as.numeric()

rejected_pairs_chr = rejected_pairs %>%
filter(chr == !!chr, !is.na(TAD_left)) %>%
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ df1_neg_control <- neg.control.pair %>% mutate(gene.hgnc.id = NULL) %>% rename("
df2_cis_pairs <- select.gRNA.gene.pair %>% rename("gRNA_id" = "gRNA.id", "gene_id" = "gene.id") %>%
mutate(type = "cis")
bulk_regions <- readRDS(paste0(processed_dir, "/bulk_region_names.rds")) %>% filter(region_name %in% c("ARL15-enh", "MYB-enh-3"))

# be sure to add ARL15 and MYB to the bulk region gene_ids
df3_bulk_validation <- expand.grid(gene_id = gene.id, gRNA_id = bulk_regions$region) %>% mutate(type = "bulk_validation")

all_pairs <- rbind(df1_neg_control, df2_cis_pairs, df3_bulk_validation)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
args <- commandArgs(trailingOnly = TRUE)
offsite_dir <- if (is.na(args[2])) .get_config_path("LOCAL_SCEPTRE_DATA_DIR") else args[1]
processed_dir <- paste0(offsite_dir, "/data/xie/processed")
code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
analysis_ready_dir <- paste0(offsite_dir, "data/xie/analysis_ready")

# source args
# Load the expression FBM to compute the gene expressions
exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds"))
exp_mat_metadata <- readRDS(paste0(analysis_ready_dir, "/exp_mat_sub_metadata.rds"))
ordered_gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
# update backing file
exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix")
exp_mat_metadata$backingfile <- paste0(analysis_ready_dir, "/expression_matrix_sub")
# load FBM given backing file
load_fbm <- function(fbm_metadata) {
out <- bigstatsr::FBM(nrow = fbm_metadata$nrow, ncol = fbm_metadata$ncol,
Expand All @@ -16,6 +17,7 @@ load_fbm <- function(fbm_metadata) {
return(out)
}
exp_mat <- load_fbm(exp_mat_metadata)
n_cells <- nrow(exp_mat)

# compute the count of each gene
gene_expressions <- bigstatsr::big_apply(X = exp_mat, a.FUN = function(x, ind) {
Expand All @@ -24,4 +26,28 @@ gene_expressions <- bigstatsr::big_apply(X = exp_mat, a.FUN = function(x, ind) {

# Finally, put the gene expression counts into a data frame
gene_expressions <- data.frame(gene_id = ordered_gene_ids, expression = gene_expressions)
saveRDS(gene_expressions, paste0(processed_dir, "/gene_expressions.rds"))
# saveRDS(gene_expressions, paste0(processed_dir, "/gene_expressions.rds"))

# conduct the binning analysis with 3 bins (fewer pairs were tested)
gene_expressions <- readRDS(paste0(processed_dir, "/gene_expressions.rds"))
results <- fst::read_fst(paste0(results_dir, "/all_results_annotated.fst"))
results_inb <- fst::read_fst(paste0(results_dir_negative_binomial, "/all_results_annotated.fst"))

conduct_binning_analysis <- function(gene_expressions, results, k) {
results_cis <- results %>% dplyr::filter(type == "cis")
# divide the cis genes into k groups based on expression level
cis_genes <- results_cis %>% dplyr::pull(gene_id) %>% as.character() %>% unique()
gene_expressions_cis <- gene_expressions %>% dplyr::filter(gene_id %in% cis_genes) %>%
dplyr::mutate(rank = dplyr::ntile(expression, n = k)) %>%
dplyr::mutate(m_expression = expression/n_cells)

# for each rank, compute the fraction of pairs rejected
gene_expressions_cis %>% dplyr::group_by(rank) %>% dplyr::group_modify(.f = function(tbl, key) {
results_cis %>% dplyr::filter(gene_id %in% tbl$gene_id) %>%
dplyr::summarize(p_rejected = 100 * mean(rejected), n_genes = nrow(tbl),
m_exp = mean(tbl$m_expression), n_pairs = length(pair_str))
})
}

binning_result_xie <- conduct_binning_analysis(gene_expressions, results, 3)
binning_result_xie_inb <- conduct_binning_analysis(gene_expressions, results_inb, 3)
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
args <- commandArgs(trailingOnly = TRUE)
code_dir <- if (is.na(args[1])) "~/research_code/sceptre-manuscript/" else args[1]
code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))

suppressPackageStartupMessages(library(R.matlab))
Expand All @@ -16,8 +16,8 @@ gRNA.fname = str_replace(gRNA_id, ':', '-')

for(i in 1:length(gRNA.fname)){
dest = paste0(raw_data_dir, '/', gRNA.fname[i], '-down_log-pval.mat')
download.file(url = paste0('https://github.com/russellxie/Global-analysis-K562-enhancers/blob/master/Notebooks/Data/Hypergeometric_pvals/',
gRNA.fname[i], '-down_log-pval.mat?raw=true'),
download.file(url = paste0('https://github.com/russellxie/Global-analysis-K562-enhancers/blob/master/Notebooks/Data/Hypergeometric_pvals/',
gRNA.fname[i], '-down_log-pval.mat?raw=true'),
destfile = dest)
}

Expand All @@ -35,10 +35,10 @@ xie_pval_mat <- as.data.frame(xie_pfiles_r)
colnames(xie_pval_mat) <- gRNA_id

original_results_xie = do.call('rbind', lapply(1:nrow(gRNA.gene.pair), function(i){
data.frame(gRNA_id = gRNA.gene.pair$gRNA_id[i],
gene_id = gRNA.gene.pair$gene_id[i],
raw_p_val = xie_pval_mat[as.character(gRNA.gene.pair$gene_id[i]),
as.character(gRNA.gene.pair$gRNA_id[i])],
data.frame(gRNA_id = gRNA.gene.pair$gRNA_id[i],
gene_id = gRNA.gene.pair$gene_id[i],
raw_p_val = xie_pval_mat[as.character(gRNA.gene.pair$gene_id[i]),
as.character(gRNA.gene.pair$gRNA_id[i])],
site_type = gRNA.gene.pair$type[i])
}))
rownames(original_results_xie) = NULL
Expand Down Expand Up @@ -78,10 +78,10 @@ SS.down.cis = ss.down[idx_temp, ]

m.gene.id = match(gRNA.gene.pair$gene_id[gRNA.gene.pair$type == 'cis'], rownames(SS.down.cis))
m.gRNA.id = match(gRNA.gene.pair$gRNA_id[gRNA.gene.pair$type == 'cis'], colnames(SS.down.cis))
ss_xie_cis = data.frame(gene_id = gRNA.gene.pair$gene_id[gRNA.gene.pair$type == 'cis'],
ss_xie_cis = data.frame(gene_id = gRNA.gene.pair$gene_id[gRNA.gene.pair$type == 'cis'],
gRNA_id = gRNA.gene.pair$gRNA_id[gRNA.gene.pair$type == 'cis'],
ss.down = SS.down.cis[cbind(m.gene.id, m.gRNA.id)])

ss_thres =sort(ss_xie_cis$ss.down, decreasing = T)[135]
ss_xie_cis$reject.down = ss_xie_cis$ss.down >= ss_thres
saveRDS(ss_xie_cis, file = paste0(processed_dir, '/ss_xie_cis.rds'))

0 comments on commit 9ec9f60

Please sign in to comment.