finish fixing preprocessing scripts 1-5

Katsevich-Lab · Aug 29, 2021 · 236be99 · 236be99
1 parent d199828
commit 236be99
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 68 deletions.
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/pre_process_data_3.R
@@ -96,83 +96,109 @@ code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
 offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
 source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
 
-genes_in_use <- readRDS(paste0(processed_dir, "/ordered_genes.RDS"))
-genes_in_use_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
-
-# Load the gRNA identification information; we save the regions of ARL15-enh and MYB-enh-1-4.
+# Save the bulk region names
 enh_targets_df <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/enh_targets.xlsx"), sheet = 1)
 bulk_region_names <- filter(enh_targets_df, gene_names %in% c("ARL15", "MYB")) %>%
   select(region, region_name = Denoted.Region.Name.used.in.the.paper, targeted_gene = gene_names) %>%
   filter(region_name %in% c("ARL15-enh", "MYB-enh-3"))
 saveRDS(object = bulk_region_names, paste0(processed_dir, "/bulk_region_names.rds"))
 
-guide_seqs <- read.xlsx(xlsxFile = paste0(raw_data_dir, "/all_oligos.xlsx"), sheet = 1) %>% rename(hg38_enh_region = "region.pos.(hg38)")
-target_regions <- guide_seqs %>% pull(hg38_enh_region) %>% unique()
-names(target_regions) <- target_regions
-
-# Next, we create a data frame containing UMI counts for all targeted putative enhancers
+####################################################
+# Next, we create a data frame containing UMI counts
+####################################################
+guide_seqs <- openxlsx::read.xlsx(xlsxFile = paste0(raw_data_dir, "/all_oligos.xlsx"),
+                                  sheet = 1) %>% dplyr::select(hg38_enh_region = "region.pos.(hg38)",
+                                                               spacer_seq = "spacer.sequence")
+weird_spacers <- names(which(table(guide_seqs$spacer_seq) >= 2))
+guide_seqs <- dplyr::filter(guide_seqs, !(spacer_seq %in% weird_spacers))
+all(table(guide_seqs$spacer_seq) == 1); all(table(guide_seqs$hg38_enh_region) == 10)
+
+# get the raw file names
 raw_fs <- list.files(raw_data_dir)
 gRNA_files <- paste0(raw_data_dir, "/", grep(pattern = "sgRNA-enrichment_5K-sgRNAs_Batch", x = raw_fs, value = TRUE))
-batch_lane <- stringr::str_extract(string = gRNA_files, pattern = "Batch_[0-9]_[0-9]") %>%
+
+# get the batch ID of each file
+batch <- stringr::str_extract(string = gRNA_files, pattern = "Batch_[0-9]_[0-9]") %>%
   gsub(pattern = "Batch_", replacement = "")
 
-# Run this part in parallel
-plan(multisession, workers = 10)
-res <- future_map(.x = seq(1L, length(gRNA_files)), .f = function(i) {
+# construct a sparse matrix of gRNA counts for each file
+res <- lapply(X = seq(1L, length(gRNA_files)), FUN = function(i) {
   curr_file <- gRNA_files[i]
-  curr_batch_lane <- batch_lane[i]
+  curr_batch <- batch[i]
   print(paste("Working on file", curr_file))
-  curr_gRNA_count_matrix <- read_tsv(file = curr_file, col_names = c("cell_barcode", "total_read_count", "total_umi_count", "gRNA_spacer_seqs", "read_counts", "umi_counts"), col_types = c("cccccc")) %>% select(cell_barcode, gRNA_spacer_seqs, umi_counts, total_umi_count)
-  cell_barcodes <- pull(curr_gRNA_count_matrix, cell_barcode)
-  # check for duplicates
-  if (any(duplicated(cell_barcodes))) stop("Duplicated barcodes.")
-  cell_barcodes <- paste0(cell_barcodes, "-1_", curr_batch_lane)
+  curr_gRNA_count_matrix <- readr::read_tsv(file = curr_file,
+                                            col_names = c("cell_barcode", "total_read_count", "total_umi_count", "gRNA_spacer_seqs", "read_counts", "umi_counts"),
+                                            col_types = c("cccccc")) %>% dplyr::select(cell_barcode, gRNA_spacer_seqs, umi_counts, total_umi_count)
+  cell_barcodes <- curr_gRNA_count_matrix$cell_barcode
+  # convert to sparse triplet matrix
+  umi_count_list <- lapply(curr_gRNA_count_matrix$umi_counts, function(v) stringr::str_split(v, pattern = ";") %>% unlist() %>% as.integer())
+  gRNA_spacer_seq_list <- lapply(curr_gRNA_count_matrix$gRNA_spacer_seqs, function(v) stringr::str_split(v, pattern = ";") %>% unlist())
+  n_entries_per_cell <- sapply(X = umi_count_list, function(v) length(v))
+  barcode_vector <- rep(x = cell_barcodes, times = n_entries_per_cell)
+  gRNA_spacer_seq_vector <- unlist(gRNA_spacer_seq_list)
+  umi_count_vector <- unlist(umi_count_list)
+  ch_df <- data.frame(barcode = barcode_vector,
+                      spacer_seq = gRNA_spacer_seq_vector,
+                      umi_count = umi_count_vector)
+  # remove all entries with spacer seqs not in the spacer seq list
+  ch_df <- ch_df %>% dplyr::filter(gRNA_spacer_seq_vector %in% guide_seqs$spacer_seq)
+  ordered_barcode_vector <- sort(unique(ch_df$barcode))
+
+  spacer_seq_idx <- match(x = ch_df$spacer_seq, guide_seqs$spacer_seq)
+  barcode_idx <- match(x = ch_df$barcode, table = ordered_barcode_vector)
 
-  count_matrix_list <- map(target_regions, function(region) {
-    cat(paste0("Working on region ", region, ".\n"))
-    region_spacer_seqs <- filter(guide_seqs, hg38_enh_region == region) %>% pull(spacer.sequence)
-    curr_batch_gRNA_umi_counts <- sapply(X = 1:nrow(curr_gRNA_count_matrix), FUN = function(row_id) {
-      r <- curr_gRNA_count_matrix[row_id,]
-      spacers <- str_split(r$gRNA_spacer_seqs, pattern = ";") %>% unlist()
-      umi_counts <- str_split(r$umi_counts, pattern = ";") %>% unlist() %>% as.integer()
-      umi_locs <- match(x = region_spacer_seqs, table = spacers)
-      curr_counts <- sapply(umi_locs, function(i) if (is.na(i)) 0 else umi_counts[i])
-      names(curr_counts) <- region_spacer_seqs
-      curr_counts
-    }) %>% t() %>% Matrix(sparse = TRUE)
-  })
-  list(cell_barcodes = cell_barcodes, count_matrix_list = count_matrix_list, total_umis = as.integer(curr_gRNA_count_matrix$total_umi_count))
+  m <- Matrix::sparseMatrix(i = spacer_seq_idx,
+                            j = barcode_idx,
+                            x = ch_df$umi_count,
+                            dims = c(length(guide_seqs$spacer_seq),
+                                     length(ordered_barcode_vector)))
+  rownames(m) <- guide_seqs$spacer_seq
+  colnames(m) <- paste0(ordered_barcode_vector, "-1_", curr_batch)
+  # obtain the gRNA UMI counts and cell count
+  curr_gRNA_count_matrix <- as.data.frame(curr_gRNA_count_matrix)
+  rownames(curr_gRNA_count_matrix) <- curr_gRNA_count_matrix$cell_barcode
+  curr_gRNA_count_matrix_sub <- curr_gRNA_count_matrix[ordered_barcode_vector,]
+  umi_counts <- as.integer(curr_gRNA_count_matrix_sub$total_umi_count)
+  n_cells <- length(umi_counts)
+  return(list(m = m, umi_counts = umi_counts, n_cells = n_cells))
 })
 
-gRNA_count_matrix_list <- map(target_regions, function(region) {
-  map(res, function(x) x$count_matrix_list[[region]]) %>% reduce(.f = rbind)
-})
+combined_m <- do.call(what = cbind, args = lapply(res, function(i) i$m))
+all(Matrix::colSums(combined_m) >= 1); all(Matrix::rowSums(combined_m) >= 1)
+
+# Finally, perform the grouping operation
+gRNA_matrix_raw <- combined_m
+unique_regions <- unique(guide_seqs$hg38_enh_region)
+
+thresholded_ungrouped_mat <- apply(X = gRNA_matrix_raw, MARGIN = 1, FUN = function(r) {
+  r_nonzero <- r[r >= 1]
+  as.integer(r > mean(r_nonzero))
+}) %>% Matrix::t()
+colnames(thresholded_ungrouped_mat) <- colnames(gRNA_matrix_raw)
+
+thresholded_grouped_mat <- sapply(X = unique_regions, FUN = function(curr_region) {
+  print(curr_region)
+  curr_spacer_seqs <- dplyr::filter(guide_seqs, hg38_enh_region == curr_region) %>%
+    dplyr::pull(spacer_seq)
+  curr_m <- thresholded_ungrouped_mat[curr_spacer_seqs,]
+  apply(X = curr_m, MARGIN = 2, FUN = function(col) max(col))
+}) %>% t()
+
+saveRDS(thresholded_grouped_mat, paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds"))
 
-cell_barcodes <- map(.x = res, .f = function(x) x$cell_barcodes) %>% reduce(.f = c)
-cell_counts <- sapply(X = res, FUN = function(x) length(x$cell_barcodes))
-cell_gRNA_umi_counts <- map(.x = res, function(x) x$total_umis) %>% reduce(.f = c)
-
-# We reduce each matrix in the gRNA_count_matrix_list to a single logical vector
-combine_gRNAs_in_group <- function(gRNA_count_matrix) {
-  apply(X = gRNA_count_matrix, MARGIN = 2, FUN = function(column) {
-    v <- sum(column >= 1)
-    U <- sum(column)
-    column/U > 1/v
-  }) %>% apply(MARGIN = 1, FUN = function(r) any(r))
-}
-gRNA_indic_matrix <- map_dfr(gRNA_count_matrix_list, combine_gRNAs_in_group) %>% as.data.frame()
-rownames(gRNA_indic_matrix) <- cell_barcodes
-saveRDS(object = gRNA_indic_matrix, file = paste0(processed_dir, "/gRNA_indicator_matrix_unordered.rds"))
 
-# create the cell covariate gRNA matrix
-cell_covariate_matrix <- data.frame(cell_barcode = cell_barcodes,
+# compute the cell covariate matrix
+cell_gRNA_umi_counts <- lapply(res, function(i) i$umi_counts) %>% do.call(what = "c", args = .)
+cell_counts <- lapply(res, function(i) i$n_cells) %>% unlist()
+cell_covariate_matrix <- data.frame(cell_barcode = colnames(thresholded_grouped_mat),
                                     cell_gRNA_umi_counts = cell_gRNA_umi_counts,
-                                    batch = rep(batch_lane, times = cell_counts))
+                                    batch = rep(batch, times = cell_counts))
 saveRDS(object = cell_covariate_matrix, file = paste0(processed_dir, "/cell_covariate_matrix_gRNA.rds"))
 
 ##############
 # Bulk RNA-seq
 ##############
+library(openxlsx)
 rm(list=ls())
 code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
 offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")

diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R
@@ -30,7 +30,7 @@ rownames(gRNA_covariate_matrix) <- gRNA_covariate_matrix$cell_barcode
 cells_intersect <- intersect(x = gene_barcode_original_order, gRNA_covariate_matrix$cell_barcode)
 
 # subset the gRNA indicator matrix and covariate matrix appropriately
-gRNA_indic_matrix_sub <- gRNA_indic_matrix[cells_intersect,]
+gRNA_indic_matrix_sub <- as.data.frame(gRNA_indic_matrix[,cells_intersect] %>% t())
 gRNA_covariate_matrix_sub <- gRNA_covariate_matrix[cells_intersect,]
 
 # get the cell subset ordering

diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/sceptre_function_args.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/sceptre_function_args.R
@@ -26,3 +26,14 @@ if (small_example) {
   gRNA_gene_pairs <- slice_sample(gRNA_gene_pairs, n = 20)
   pod_sizes <- c(gene = 2, gRNA = 5, pair = 5)
 }
+
+
+# test ARL15-enh against ARL15
+if (FALSE) {
+  expressions <- cell_gene_expression_matrix[, ordered_gene_ids == "ENSG00000185305.10"]
+  indicators <- read_fst(gRNA_indicator_matrix_fp, columns = "chr5:54325645-54326045") %>% dplyr::pull() %>% as.integer()
+  fit <- run_sceptre_gRNA_gene_pair(expressions = expressions,
+                             gRNA_indicators = indicators,
+                             covariate_matrix = covariate_matrix,
+                             B = 500) 
+}
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R
@@ -13,22 +13,29 @@ library(biomaRt)
 # 1. gene positions 
 #####################
 gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds'))   # 5947 genes (now 2437 genes)
-gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist
-# The gene ID is from GENCODE, which is not excatly the same as ENSEMBL ID. The part before [.] is the same as ENSEMBL ID.
-
-ensembl <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
-ensembl.37 <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh = 37)
-
-temp <- getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol','chromosome_name','start_position','end_position', 'strand'), 
-              mart = ensembl, useCache = FALSE)
+gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist()
+ensembl <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
+ensembl.37 <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37)
+temp <- biomaRt::getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol', 'chromosome_name',
+                                    'start_position', 'end_position', 'strand'),
+                       mart = ensembl, useCache = FALSE)
 gene.mart <- temp[match(gene.ensembl.id, temp$ensembl_gene_id), ]  # Match ensembl id with chromosome positions.
+gene.mart$original.id <- gene.id
 
 # some gene id is from GRCh37, which has been deleted from CRCh38. We don't want to miss them.
-left_gene <- gene.ensembl.id[is.na(gene.mart$ensembl_gene_id)]
-temp.37 <- getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol','chromosome_name','start_position','end_position', 'strand'), 
-                filters = 'ensembl_gene_id', values = left_gene, mart = ensembl.37, useCache = FALSE)
-gene.mart[is.na(gene.mart$ensembl_gene_id), ] <- temp.37
+na_gene_mart_idx <- which(is.na(gene.mart$ensembl_gene_id))
+left_gene <- gene.ensembl.id[na_gene_mart_idx]
+temp.37 <- biomaRt::getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol','chromosome_name',
+                                         'start_position', 'end_position', 'strand'),
+                          filters = 'ensembl_gene_id', values = left_gene, mart = ensembl.37, useCache = FALSE)
+gene.mart.left <- temp.37[match(left_gene, temp.37$ensembl_gene_id),]
+gene.mart.left$original.id <- gene.id[na_gene_mart_idx]
+
+# finally, remove the NA entries, and combine gene.mart and gene.mart.left
+gene.mart <- na.omit(gene.mart); gene.mart.left <- na.omit(gene.mart.left)
+gene.mart <- rbind(gene.mart, gene.mart.left)
 gene.mart$chr <- as.factor(paste0('chr', gene.mart$chromosome_name))
+row.names(gene.mart) <- NULL
 saveRDS(gene.mart, file = paste0(processed_dir, "/gene_mart.rds"))
 
 
@@ -72,7 +79,8 @@ for(chr in chr.select){
   distance.temp = sapply(gene.tss, function(x){x - gRNA.pos})
   temp.id = which(abs(distance.temp) < 1000000, arr.ind = T)
   select.gRNA.gene.pair = rbind(select.gRNA.gene.pair, 
-                                data.frame(gene.id = gene.id[gene.chr.id[temp.id[, 2]]], gRNA.id = gRNA.id[gRNA.chr.id[temp.id[, 1]]]))
+                                data.frame(gene.id = gene.mart$original.id[gene.chr.id[temp.id[, 2]]],
+                                           gRNA.id = gRNA.id[gRNA.chr.id[temp.id[, 1]]]))
   # cat(chr, ' is done! \n')
 }
 dim(select.gRNA.gene.pair)
@@ -202,3 +210,11 @@ all_pairs <- rbind(df1_neg_control, df2_cis_pairs, df3_bulk_validation)
 all_pairs_f <- mutate_all(all_pairs, factor)
 
 write_fst(x = all_pairs_f, path = paste0(processed_dir, "/gRNA_gene_pairs.fst")) 
+
+########################################################################################
+# Sanity check: for a few cis pairs, verify the gene and gRNA are on the same chromosome
+########################################################################################
+set.seed(4)
+test_df <- dplyr::filter(all_pairs_f, type == "cis") %>% dplyr::sample_n(5) %>% dplyr::arrange(gene_id)
+test_df_genes <- lapply(strsplit(as.character(test_df$gene_id), '[.]'), function(x){x[1]}) %>% unlist()
+dplyr::filter(gene.mart, ensembl_gene_id %in% test_df_genes) %>% dplyr::arrange(ensembl_gene_id) # OK