From 4ec59506b65b989a029c315153acd212fd2d5015 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 7 Sep 2021 22:45:01 -0400 Subject: [PATCH] use ondisc for preprocessing --- .../analysis_drivers_xie/pre_process_data_3.R | 88 ++----- .../pre_process_data_3_old.R | 242 ++++++++++++++++++ .../analysis_drivers_xie/quality_control_4.R | 75 +++--- .../quality_control_4_old.R | 78 ++++++ .../select_gRNA_gene_pair_5.R | 11 +- 5 files changed, 382 insertions(+), 112 deletions(-) create mode 100644 sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3_old.R create mode 100644 sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4_old.R diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R index ae6be3a..974bce2 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R @@ -1,14 +1,10 @@ code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") - -# Pre-process data source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) -packs <- c("bigstatsr", "future", "furrr", "Matrix", "rhdf5", "stringr", "openxlsx", "katsevich2020", "magrittr") -for (pack in packs) suppressPackageStartupMessages(library(pack, character.only = TRUE)) +library(ondisc) +library(rhdf5) +library(openxlsx) -################### -# Expression matrix -################### # First, we determine the number of cells and number of genes across all the batches h5_files <- paste0(raw_data_dir, "/", grep(pattern = '*.h5', list.files(raw_data_dir), value = TRUE)) dims_across_h5s <- sapply(h5_files, function(h5_file) { @@ -22,71 +18,27 @@ row.names(dims_across_h5s) <- NULL all(dims_across_h5s[,1] == dims_across_h5s[1,1]) # Verify n genes consistent across files n_cells_total <- sum(dims_across_h5s[,2]) n_genes_total <- dims_across_h5s[1,1] +batch_lane <- stringr::str_extract(string = h5_files, pattern = "Batch-[0-9]_[0-9]") %>% + gsub(pattern = "Batch-", replacement = "") + +# Next, initialize an ondisc matrix storing the gene expression data +if (!dir.exists(paste0(processed_dir, "/odm"))) dir.create(paste0(processed_dir, "/odm")) +h5_odm <- create_ondisc_matrix_from_h5_list(h5_list = h5_files, + odm_fp = paste0(processed_dir, "/odm/gene_expression_matrix"), + metadata_fp = paste0(processed_dir, "/odm/gene_expression_metadata.rds"), + barcode_suffixes = batch_lane, + progress = TRUE) -# We will use only the protein-coding genes in this analysis; load them. +# modify the ondisc matrix by identifying the protein-coding genes gene_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/Genes.xlsx"), sheet = 1) all_protein_coding_genes <- gene_df$Gene_Symbol rm(gene_df) -h5_file <- h5_files[1] -h5_handle <- H5Fopen(h5_file) -all_sequenced_genes <- h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/gene_names" -all_sequenced_genes_ids <- h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes" -saveRDS(all_sequenced_genes_ids, file = paste0(processed_dir, '/all_sequenced_genes_id.rds')) - -h5_gene_info <- data.frame(row_idx = seq(1, n_genes_total), gene_name = all_sequenced_genes, gene_id = all_sequenced_genes_ids) -h5_gene_info_protein_coding <- dplyr::filter(h5_gene_info, - gene_name %in% all_protein_coding_genes) -genes_in_use <- unique(h5_gene_info_protein_coding$gene_name) -genes_in_use_ids <- h5_gene_info_protein_coding$gene_id -gene_idxs_in_use <- h5_gene_info_protein_coding$row_idx -n_genes_in_use <- length(genes_in_use_ids) -H5Fclose(h5_handle) - -# Next, we create a file-backed matrix to store the transpose of the expression matrix -exp_mat_t <- FBM(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix_t"), create_bk = TRUE) -exp_mat_t_metadata <- list(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix_t")) -saveRDS(object = exp_mat_t_metadata, file = paste0(processed_dir, "/exp_mat_t_metadata.rds")) - -exp_mat <- FBM(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix"), create_bk = TRUE) -exp_mat_metadata <- list(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix")) -saveRDS(object = exp_mat_metadata, file = paste0(processed_dir, "/exp_mat_metadata.rds")) - -# We iterate through the hd5 files and write the column chunks piece-by-piece to the FBM. -n_cells_processed <- 0 -ordered_cell_barcodes <- character(0) - -batch_lane <- stringr::str_extract(string = h5_files, pattern = "Batch-[0-9]_[0-9]") %>% - gsub(pattern = "Batch-", replacement = "") -for (i in seq(1, length(h5_files))) { - print(i) - curr_batch_lane <- batch_lane[i] - h5_file <- h5_files[i] - print(paste("Working on", h5_file)) - h5_handle <- H5Fopen(h5_file) - dat <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/data" - indices <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indices" - ind_ptr <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indptr" - shape <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/shape" - n_cols <- shape[2] - cell_barcodes <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/barcodes" - # do some barcode processing - if (any(duplicated(cell_barcodes))) stop("Duplicate barcodes") - cell_barcodes <- paste0(cell_barcodes, "_", curr_batch_lane) - # Ensure that the maximum index pointer is less than the maximum integer value in R - if (max(ind_ptr) >= 2147483647) stop("Index pointer exceeds R max integer value.") - sparseM <- Matrix::sparseMatrix(i = indices, p = ind_ptr, x = dat, dims = shape) - sparseM <- sparseM[gene_idxs_in_use,] - # write - exp_mat_t[seq(1, n_genes_in_use), (n_cells_processed + 1):(n_cells_processed + n_cols)] <- as.matrix(sparseM) - exp_mat[(n_cells_processed + 1):(n_cells_processed + n_cols), seq(1, n_genes_in_use)] <- as.matrix(t(sparseM)) - H5Fclose(h5_handle) - n_cells_processed <- n_cells_processed + n_cols - ordered_cell_barcodes <- c(ordered_cell_barcodes, cell_barcodes) -} - -saveRDS(object = ordered_cell_barcodes, file = paste0(processed_dir, "/cell_barcodes_gene.rds")) -saveRDS(object = genes_in_use, file = paste0(processed_dir, "/ordered_genes.RDS")) -saveRDS(object = genes_in_use_ids, file = paste0(processed_dir, "/ordered_gene_ids.RDS")) +is_protein_coding <- get_feature_names(h5_odm) %in% all_protein_coding_genes +h5_odm_w_protein_coding <- h5_odm %>% mutate_feature_covariates(protein_coding = is_protein_coding) +save_odm(h5_odm_w_protein_coding, paste0(processed_dir, "/odm/metadata_plus.rds")) +all_gene_ids <- rhdf5::h5read(file = h5_files[1], name = "/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes") +saveRDS(object = all_gene_ids, + file = paste0(processed_dir, '/all_sequenced_genes_id.rds')) ############### # gRNA UMI data diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3_old.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3_old.R new file mode 100644 index 0000000..ae6be3a --- /dev/null +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3_old.R @@ -0,0 +1,242 @@ +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") + +# Pre-process data +source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) +packs <- c("bigstatsr", "future", "furrr", "Matrix", "rhdf5", "stringr", "openxlsx", "katsevich2020", "magrittr") +for (pack in packs) suppressPackageStartupMessages(library(pack, character.only = TRUE)) + +################### +# Expression matrix +################### +# First, we determine the number of cells and number of genes across all the batches +h5_files <- paste0(raw_data_dir, "/", grep(pattern = '*.h5', list.files(raw_data_dir), value = TRUE)) +dims_across_h5s <- sapply(h5_files, function(h5_file) { + h5_handle <- H5Fopen(h5_file) + dim <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/shape" + out <- as.integer(dim) + H5Fclose(h5_handle) + return(out) +}) %>% t() +row.names(dims_across_h5s) <- NULL +all(dims_across_h5s[,1] == dims_across_h5s[1,1]) # Verify n genes consistent across files +n_cells_total <- sum(dims_across_h5s[,2]) +n_genes_total <- dims_across_h5s[1,1] + +# We will use only the protein-coding genes in this analysis; load them. +gene_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/Genes.xlsx"), sheet = 1) +all_protein_coding_genes <- gene_df$Gene_Symbol +rm(gene_df) +h5_file <- h5_files[1] +h5_handle <- H5Fopen(h5_file) +all_sequenced_genes <- h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/gene_names" +all_sequenced_genes_ids <- h5_handle$"/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes" +saveRDS(all_sequenced_genes_ids, file = paste0(processed_dir, '/all_sequenced_genes_id.rds')) + +h5_gene_info <- data.frame(row_idx = seq(1, n_genes_total), gene_name = all_sequenced_genes, gene_id = all_sequenced_genes_ids) +h5_gene_info_protein_coding <- dplyr::filter(h5_gene_info, + gene_name %in% all_protein_coding_genes) +genes_in_use <- unique(h5_gene_info_protein_coding$gene_name) +genes_in_use_ids <- h5_gene_info_protein_coding$gene_id +gene_idxs_in_use <- h5_gene_info_protein_coding$row_idx +n_genes_in_use <- length(genes_in_use_ids) +H5Fclose(h5_handle) + +# Next, we create a file-backed matrix to store the transpose of the expression matrix +exp_mat_t <- FBM(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix_t"), create_bk = TRUE) +exp_mat_t_metadata <- list(nrow = n_genes_in_use, ncol = n_cells_total, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix_t")) +saveRDS(object = exp_mat_t_metadata, file = paste0(processed_dir, "/exp_mat_t_metadata.rds")) + +exp_mat <- FBM(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", init = 0, backingfile = paste0(processed_dir, "/expression_matrix"), create_bk = TRUE) +exp_mat_metadata <- list(nrow = n_cells_total, ncol = n_genes_in_use, type = "unsigned short", backingfile = paste0(processed_dir, "/expression_matrix")) +saveRDS(object = exp_mat_metadata, file = paste0(processed_dir, "/exp_mat_metadata.rds")) + +# We iterate through the hd5 files and write the column chunks piece-by-piece to the FBM. +n_cells_processed <- 0 +ordered_cell_barcodes <- character(0) + +batch_lane <- stringr::str_extract(string = h5_files, pattern = "Batch-[0-9]_[0-9]") %>% + gsub(pattern = "Batch-", replacement = "") +for (i in seq(1, length(h5_files))) { + print(i) + curr_batch_lane <- batch_lane[i] + h5_file <- h5_files[i] + print(paste("Working on", h5_file)) + h5_handle <- H5Fopen(h5_file) + dat <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/data" + indices <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indices" + ind_ptr <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/indptr" + shape <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/shape" + n_cols <- shape[2] + cell_barcodes <- h5_handle$"refgenome_hg38_CROP-Guide-MS2-2.1.0/barcodes" + # do some barcode processing + if (any(duplicated(cell_barcodes))) stop("Duplicate barcodes") + cell_barcodes <- paste0(cell_barcodes, "_", curr_batch_lane) + # Ensure that the maximum index pointer is less than the maximum integer value in R + if (max(ind_ptr) >= 2147483647) stop("Index pointer exceeds R max integer value.") + sparseM <- Matrix::sparseMatrix(i = indices, p = ind_ptr, x = dat, dims = shape) + sparseM <- sparseM[gene_idxs_in_use,] + # write + exp_mat_t[seq(1, n_genes_in_use), (n_cells_processed + 1):(n_cells_processed + n_cols)] <- as.matrix(sparseM) + exp_mat[(n_cells_processed + 1):(n_cells_processed + n_cols), seq(1, n_genes_in_use)] <- as.matrix(t(sparseM)) + H5Fclose(h5_handle) + n_cells_processed <- n_cells_processed + n_cols + ordered_cell_barcodes <- c(ordered_cell_barcodes, cell_barcodes) +} + +saveRDS(object = ordered_cell_barcodes, file = paste0(processed_dir, "/cell_barcodes_gene.rds")) +saveRDS(object = genes_in_use, file = paste0(processed_dir, "/ordered_genes.RDS")) +saveRDS(object = genes_in_use_ids, file = paste0(processed_dir, "/ordered_gene_ids.RDS")) + +############### +# gRNA UMI data +############### +rm(list=ls()) +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") +source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) + +# Save the bulk region names +h5_files <- paste0(raw_data_dir, "/", grep(pattern = '*.h5', list.files(raw_data_dir), value = TRUE)) +gene_names <- rhdf5::h5read(h5_files[1], "/refgenome_hg38_CROP-Guide-MS2-2.1.0/gene_names") +gene_ids <- rhdf5::h5read(h5_files[1], "/refgenome_hg38_CROP-Guide-MS2-2.1.0/genes") + +enh_targets_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/enh_targets.xlsx"), sheet = 1) +bulk_region_names <- filter(enh_targets_df, gene_names %in% c("ARL15", "MYB")) %>% + dplyr::select(region, region_name = Denoted.Region.Name.used.in.the.paper, targeted_gene = gene_names) %>% + filter(region_name %in% c("ARL15-enh", "MYB-enh-3")) %>% + dplyr::mutate(targeted_gene_id = c(gene_ids["ARL15" == gene_names], + gene_ids["MYB" == gene_names])) +saveRDS(object = bulk_region_names, paste0(processed_dir, "/bulk_region_names.rds")) + + +#################################################### +# Next, we create a data frame containing UMI counts +#################################################### +guide_seqs <- openxlsx::read.xlsx(xlsxFile = paste0(raw_data_dir, "/all_oligos.xlsx"), + sheet = 1) %>% dplyr::select(hg38_enh_region = "region.pos.(hg38)", + spacer_seq = "spacer.sequence") +weird_spacers <- names(which(table(guide_seqs$spacer_seq) >= 2)) +guide_seqs <- dplyr::filter(guide_seqs, !(spacer_seq %in% weird_spacers)) +all(table(guide_seqs$spacer_seq) == 1); all(table(guide_seqs$hg38_enh_region) == 10) + +# get the raw file names +raw_fs <- list.files(raw_data_dir) +gRNA_files <- paste0(raw_data_dir, "/", grep(pattern = "sgRNA-enrichment_5K-sgRNAs_Batch", x = raw_fs, value = TRUE)) + +# get the batch ID of each file +batch <- stringr::str_extract(string = gRNA_files, pattern = "Batch_[0-9]_[0-9]") %>% + gsub(pattern = "Batch_", replacement = "") + +# construct a sparse matrix of gRNA counts for each file +res <- lapply(X = seq(1L, length(gRNA_files)), FUN = function(i) { + curr_file <- gRNA_files[i] + curr_batch <- batch[i] + print(paste("Working on file", curr_file)) + curr_gRNA_count_matrix <- readr::read_tsv(file = curr_file, + col_names = c("cell_barcode", "total_read_count", "total_umi_count", "gRNA_spacer_seqs", "read_counts", "umi_counts"), + col_types = c("cccccc")) %>% dplyr::select(cell_barcode, gRNA_spacer_seqs, umi_counts, total_umi_count) + cell_barcodes <- curr_gRNA_count_matrix$cell_barcode + # convert to sparse triplet matrix + umi_count_list <- lapply(curr_gRNA_count_matrix$umi_counts, function(v) stringr::str_split(v, pattern = ";") %>% unlist() %>% as.integer()) + gRNA_spacer_seq_list <- lapply(curr_gRNA_count_matrix$gRNA_spacer_seqs, function(v) stringr::str_split(v, pattern = ";") %>% unlist()) + n_entries_per_cell <- sapply(X = umi_count_list, function(v) length(v)) + barcode_vector <- rep(x = cell_barcodes, times = n_entries_per_cell) + gRNA_spacer_seq_vector <- unlist(gRNA_spacer_seq_list) + umi_count_vector <- unlist(umi_count_list) + ch_df <- data.frame(barcode = barcode_vector, + spacer_seq = gRNA_spacer_seq_vector, + umi_count = umi_count_vector) + # remove all entries with spacer seqs not in the spacer seq list + ch_df <- ch_df %>% dplyr::filter(gRNA_spacer_seq_vector %in% guide_seqs$spacer_seq) + ordered_barcode_vector <- sort(unique(ch_df$barcode)) + + spacer_seq_idx <- match(x = ch_df$spacer_seq, guide_seqs$spacer_seq) + barcode_idx <- match(x = ch_df$barcode, table = ordered_barcode_vector) + + m <- Matrix::sparseMatrix(i = spacer_seq_idx, + j = barcode_idx, + x = ch_df$umi_count, + dims = c(length(guide_seqs$spacer_seq), + length(ordered_barcode_vector))) + rownames(m) <- guide_seqs$spacer_seq + colnames(m) <- paste0(ordered_barcode_vector, "-1_", curr_batch) + # obtain the gRNA UMI counts and cell count + curr_gRNA_count_matrix <- as.data.frame(curr_gRNA_count_matrix) + rownames(curr_gRNA_count_matrix) <- curr_gRNA_count_matrix$cell_barcode + curr_gRNA_count_matrix_sub <- curr_gRNA_count_matrix[ordered_barcode_vector,] + umi_counts <- as.integer(curr_gRNA_count_matrix_sub$total_umi_count) + n_cells <- length(umi_counts) + return(list(m = m, umi_counts = umi_counts, n_cells = n_cells)) +}) + +combined_m <- do.call(what = cbind, args = lapply(res, function(i) i$m)) +all(Matrix::colSums(combined_m) >= 1); all(Matrix::rowSums(combined_m) >= 1) + +# Finally, perform the grouping operation +gRNA_matrix_raw <- combined_m +unique_regions <- unique(guide_seqs$hg38_enh_region) + +thresholded_ungrouped_mat <- apply(X = gRNA_matrix_raw, MARGIN = 1, FUN = function(r) { + r_nonzero <- r[r >= 1] + as.integer(r > mean(r_nonzero)) +}) %>% Matrix::t() +colnames(thresholded_ungrouped_mat) <- colnames(gRNA_matrix_raw) + +thresholded_grouped_mat <- sapply(X = unique_regions, FUN = function(curr_region) { + print(curr_region) + curr_spacer_seqs <- dplyr::filter(guide_seqs, hg38_enh_region == curr_region) %>% + dplyr::pull(spacer_seq) + curr_m <- thresholded_ungrouped_mat[curr_spacer_seqs,] + apply(X = curr_m, MARGIN = 2, FUN = function(col) max(col)) +}) %>% t() + +saveRDS(thresholded_grouped_mat, paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds")) + + +# compute the cell covariate matrix +cell_gRNA_umi_counts <- lapply(res, function(i) i$umi_counts) %>% do.call(what = "c", args = .) +cell_counts <- lapply(res, function(i) i$n_cells) %>% unlist() +cell_covariate_matrix <- data.frame(cell_barcode = colnames(thresholded_grouped_mat), + cell_gRNA_umi_counts = cell_gRNA_umi_counts, + batch = rep(batch, times = cell_counts)) +saveRDS(object = cell_covariate_matrix, file = paste0(processed_dir, "/cell_covariate_matrix_gRNA.rds")) + +############## +# Bulk RNA-seq +############## +library(openxlsx) +rm(list=ls()) +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") +source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) + +gene_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/Genes.xlsx"), sheet = 1) +all_protein_coding_genes <- gene_df$Gene_Symbol +bulk_info <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/bulk_rna_info.xlsx"), sheet = 3) %>% select(library_name = Library.Name, gRNA = sgRNA, region = Region, biological_duplicate = Biological.Duplicate) +bulk_info_arl15_enh <- slice(bulk_info, 1:25) +bulk_info_myb_enh3 <- slice(bulk_info, 26:49) + +bulk_df <- suppressWarnings(read_tsv(file = paste0(raw_data_dir, "/GSE129825_Libraries.FeatureCounts.ARL15_enhancer.txt"), col_types = "ccccciiiiiiiiiiiiiiiiiiiiiiiiii")) %>% rename("PZ788" = "PZ778...10", "PZ778" = "PZ778...13") +bulk_df_arl15_enh <- filter(bulk_df, Geneid %in% all_protein_coding_genes) +bulk_df <- suppressWarnings(read_tsv(file = paste0(raw_data_dir, "/GSE129825_Libraries.FeatureCounts.MYB_enhancer.txt"), col_types = "ccccciiiiiiiiiiiiiiiiiiiiiiiiii")) +bulk_df_myb_enh3 <- filter(bulk_df, Geneid %in% all_protein_coding_genes) + +bulk_rnaseq <- list(data = list(arl15_enh = bulk_df_arl15_enh, myb_enh3 = bulk_df_myb_enh3), info = list(arl15_enh = bulk_info_arl15_enh, myb_enh3 = bulk_info_myb_enh3)) +saveRDS(object = bulk_rnaseq, file = paste0(processed_dir, "/bulk_RNAseq.rds")) + +################################################# +# Xie hypergeometric p-values for ARL15 and MYB-3 +################################################# +all_sequenced_genes_ids <- readRDS(file = paste0(processed_dir, '/all_sequenced_genes_id.rds')) +suppressPackageStartupMessages(library(R.matlab)) +extract_p_vals_hypergeo <- function(p_vals_hypergeo) { + p_vals <- exp(p_vals_hypergeo$matrix[1,]) + names(p_vals) <- all_sequenced_genes_ids + return(p_vals) +} + +xie_pfiles <- setNames(paste0(raw_data_dir, c("/hypergeometric_pvals_arl15_down.mat", "/hypergeometric_pvals_myb3_down.mat")), c("arl15_enh", "myb_enh3")) +xie_pfiles_r <- xie_pfiles %>% map(readMat) %>% map(extract_p_vals_hypergeo) + +saveRDS(object = xie_pfiles_r, file = paste0(processed_dir, c("/xie_p_values.rds"))) diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R index 0f25052..9a082bb 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R @@ -1,54 +1,53 @@ code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) +library(ondisc) -################################################# -# 1. Load gene data; create gene covariate matrix -################################################# -barcodes_gene <- readRDS(paste0(processed_dir, "/cell_barcodes_gene.rds")) - -# Compute the number of UMIs in each cell -exp_mat_metadata_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds")) -exp_mat_metadata_t$backingfile <- paste0(processed_dir, "/expression_matrix_t") -exp_mat_t <- exp_mat_metadata_t %>% sceptre::load_fbm() - -n_umis_per_cell <- big_apply(exp_mat_t, function(X, ind) {colSums(X[,ind])}) %>% unlist() -gene_covariate_matrix <- data.frame(cell_barcode = barcodes_gene, log_n_umis = n_umis_per_cell) -gene_barcode_original_order <- gene_covariate_matrix$cell_barcode +############## +# 1. Gene data +############## +odm_dir <- paste0(processed_dir, "/odm") +odm_fp <- paste0(odm_dir, "/gene_expression_matrix.odm") +metadata_fp <- paste0(odm_dir, "/metadata_plus.rds") +gene_odm <- read_odm(odm_fp = odm_fp, metadata_fp = metadata_fp) # Compute the mean expression of each gene -gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) -exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) -exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix") -exp_mat <- load_fbm(exp_mat_metadata) -gene_expression_p <- big_apply(exp_mat, function(X, ind) colMeans(X[,ind] >= 1)) %>% unlist() -highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.005)] # threshold of 0.005 (1/2%) +highly_expressed_genes <- get_feature_covariates(gene_odm) %>% dplyr::mutate(gene_id = get_feature_ids(gene_odm), + p_expressed = n_nonzero / ncol(gene_odm)) %>% + dplyr::filter(p_expressed >= 0.005, protein_coding) %>% dplyr::pull(gene_id) -########################################## -# Load gRNA data and gRNA covariate matrix -########################################## +# Save the highly expressed genes; subset expression matrix according to these genes +saveRDS(object = highly_expressed_genes, file = paste0(processed_dir, "/highly_expressed_genes.rds")) +gene_odm <- gene_odm[highly_expressed_genes,] # 10,014 genes, 106,670 cells + +############## +# 2. gRNA data +############## gRNA_indic_matrix <- readRDS(paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds")) gRNA_covariate_matrix <- readRDS(paste0(processed_dir, "/cell_covariate_matrix_gRNA.rds")) rownames(gRNA_covariate_matrix) <- gRNA_covariate_matrix$cell_barcode -# Intersect the cells -cells_intersect <- intersect(x = gene_barcode_original_order, gRNA_covariate_matrix$cell_barcode) +###################### +# 3. Cell intersection +###################### +# Intersect the gRNA and gene cells +cells_intersect <- intersect(x = get_cell_barcodes(gene_odm), + y = colnames(gRNA_indic_matrix)) # subset the gRNA indicator matrix and covariate matrix appropriately gRNA_indic_matrix_sub <- as.data.frame(gRNA_indic_matrix[,cells_intersect] %>% t()) gRNA_covariate_matrix_sub <- gRNA_covariate_matrix[cells_intersect,] -# get the cell subset ordering -cell_subset <- match(cells_intersect, gene_barcode_original_order) - -################################ -# get the global covariate matrix -################################ -global_covariate_matrix <- data.frame(log_n_umis = log(n_umis_per_cell[cell_subset]), +# subset the gene matrix +gene_odm <- gene_odm[,cells_intersect] +# obtain the global cell covariate matrix +global_covariate_matrix <- data.frame(log_n_umis = get_cell_covariates(gene_odm) %>% dplyr::pull(n_umis) %>% log(), log_n_gRNA_umis = log(gRNA_covariate_matrix_sub$cell_gRNA_umi_counts), batch = gRNA_covariate_matrix_sub$batch) -# save +################################################################################## +# Create analysis-ready dir; save highly expressed genes and gRNA indicator matrix +################################################################################## # create a new directory, "analysis_ready" to store all data in analysis-ready format. analysis_ready_dir <- paste0(offsite_dir, "data/xie/analysis_ready") if (!dir.exists(analysis_ready_dir)) dir.create(path = analysis_ready_dir, recursive = TRUE) @@ -56,14 +55,12 @@ if (!dir.exists(analysis_ready_dir)) dir.create(path = analysis_ready_dir, recur fst::write_fst(x = global_covariate_matrix, path = paste0(analysis_ready_dir, "/covariate_model_matrix.fst")) saveRDS(object = highly_expressed_genes, file = paste0(processed_dir, "/highly_expressed_genes.rds")) -gRNA_indic_mat <- fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_dir, "/gRNA_indicator_matrix.fst")) +fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_dir, "/gRNA_indicator_matrix.fst")) -############################################### -# save subsetted cell-by-gene expression matrix -############################################### -gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) -exp_mat_mem <- exp_mat[,seq(1, ncol(exp_mat))] -exp_mat_sub <- exp_mat_mem[cell_subset,] +###################################################################################### +# save subsetted cell-by-gene expression matrix (only highly expressed genes included) +###################################################################################### +exp_mat_sub <- as.matrix(gene_odm[[,1:ncol(gene_odm)]]) exp_mat_sub_disk <- FBM(nrow = nrow(exp_mat_sub), ncol = ncol(exp_mat_sub), type = "unsigned short", diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4_old.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4_old.R new file mode 100644 index 0000000..4e041c4 --- /dev/null +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4_old.R @@ -0,0 +1,78 @@ +code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript") +offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR") +source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R")) + +################################################# +# 1. Load gene data; create gene covariate matrix +################################################# +barcodes_gene <- readRDS(paste0(processed_dir, "/cell_barcodes_gene.rds")) + +# Compute the number of UMIs in each cell +exp_mat_metadata_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds")) +exp_mat_metadata_t$backingfile <- paste0(processed_dir, "/expression_matrix_t") +exp_mat_t <- exp_mat_metadata_t %>% sceptre::load_fbm() + +n_umis_per_cell <- big_apply(exp_mat_t, function(X, ind) {colSums(X[,ind])}) %>% unlist() +gene_covariate_matrix <- data.frame(cell_barcode = barcodes_gene, log_n_umis = n_umis_per_cell) +gene_barcode_original_order <- gene_covariate_matrix$cell_barcode + +# Compute the mean expression of each gene +gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) +exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) +exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix") +exp_mat <- load_fbm(exp_mat_metadata) +gene_expression_p <- big_apply(exp_mat, function(X, ind) colMeans(X[,ind] >= 1)) %>% unlist() +highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.005)] # threshold of 0.005 (1/2%) + +########################################## +# Load gRNA data and gRNA covariate matrix +########################################## +gRNA_indic_matrix <- readRDS(paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds")) +gRNA_covariate_matrix <- readRDS(paste0(processed_dir, "/cell_covariate_matrix_gRNA.rds")) +rownames(gRNA_covariate_matrix) <- gRNA_covariate_matrix$cell_barcode + +# Intersect the cells +cells_intersect <- intersect(x = gene_barcode_original_order, gRNA_covariate_matrix$cell_barcode) + +# subset the gRNA indicator matrix and covariate matrix appropriately +gRNA_indic_matrix_sub <- as.data.frame(gRNA_indic_matrix[,cells_intersect] %>% t()) +gRNA_covariate_matrix_sub <- gRNA_covariate_matrix[cells_intersect,] + +# get the cell subset ordering +cell_subset <- match(cells_intersect, gene_barcode_original_order) + +################################ +# get the global covariate matrix +################################ +global_covariate_matrix <- data.frame(log_n_umis = log(n_umis_per_cell[cell_subset]), + log_n_gRNA_umis = log(gRNA_covariate_matrix_sub$cell_gRNA_umi_counts), + batch = gRNA_covariate_matrix_sub$batch) + +# save +# create a new directory, "analysis_ready" to store all data in analysis-ready format. +analysis_ready_dir <- paste0(offsite_dir, "data/xie/analysis_ready") +if (!dir.exists(analysis_ready_dir)) dir.create(path = analysis_ready_dir, recursive = TRUE) + +fst::write_fst(x = global_covariate_matrix, + path = paste0(analysis_ready_dir, "/covariate_model_matrix.fst")) +saveRDS(object = highly_expressed_genes, file = paste0(processed_dir, "/highly_expressed_genes.rds")) +gRNA_indic_mat <- fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_dir, "/gRNA_indicator_matrix.fst")) + +###################################################################################### +# save subsetted cell-by-gene expression matrix (only highly expressed genes included) +###################################################################################### +gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) +exp_mat_mem <- exp_mat[,seq(1, ncol(exp_mat))] +exp_mat_sub <- exp_mat_mem[cell_subset,] +exp_mat_sub_disk <- FBM(nrow = nrow(exp_mat_sub), + ncol = ncol(exp_mat_sub), + type = "unsigned short", + init = 0, + backingfile = paste0(analysis_ready_dir, "/expression_matrix_sub"), + create_bk = TRUE) +exp_mat_sub_disk[1:nrow(exp_mat_sub), 1:ncol(exp_mat_sub)] <- exp_mat_sub +exp_mat_sub_metadata <- list(nrow = nrow(exp_mat_sub), + ncol = ncol(exp_mat_sub), + type = "unsigned short", + backingfile = paste0(analysis_ready_dir, "/expression_matrix_sub")) +saveRDS(object = exp_mat_sub_metadata, paste0(analysis_ready_dir, "/exp_mat_sub_metadata.rds")) diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R index 0eff249..3e2c4da 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R @@ -13,7 +13,7 @@ library(biomaRt) ##################### # 1. gene positions ##################### -gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds')) # 5129 +gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds')) # 10614 gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist() ensembl <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl") ensembl.37 <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37) @@ -68,21 +68,22 @@ chr.select = intersect(gRNA.mart$chr, gene.mart$chr) select.gRNA.gene.pair = NULL -for(chr in chr.select){ +for (chr in chr.select) { gene.chr.id = which(gene.mart$chr == chr) gRNA.chr.id = which(gRNA.mart$chr == chr) gene.pos.start = gene.mart$start_position[gene.chr.id] gene.pos.end = gene.mart$end_position[gene.chr.id] gene.tss = gene.pos.start - gene.tss[gene.mart$strand == -1] = gene.pos.end[gene.mart$strand == -1] + # gene.tss[gene.mart$strand == -1] = gene.pos.end[gene.mart$strand == -1] gRNA.pos = gRNA.mart$mid_position[gRNA.chr.id] - distance.temp = sapply(gene.tss, function(x){x - gRNA.pos}) - temp.id = which(abs(distance.temp) < 1000000, arr.ind = T) + distance.temp = sapply(gene.tss, function(x) {abs(x - gRNA.pos)}) + temp.id = which(distance.temp < 1000000, arr.ind = T) select.gRNA.gene.pair = rbind(select.gRNA.gene.pair, data.frame(gene.id = gene.mart$original.id[gene.chr.id[temp.id[, 2]]], gRNA.id = gRNA.id[gRNA.chr.id[temp.id[, 1]]])) # cat(chr, ' is done! \n') } + dim(select.gRNA.gene.pair) # 2376 pairs of gene and gRNA saveRDS(select.gRNA.gene.pair, file = paste0(processed_dir, "/select_gRNA_gene_pair.rds"))