use more liberal threshold of 0.005 to select genes

Katsevich-Lab · Sep 3, 2021 · b6978b6 · b6978b6
1 parent c97ae1d
commit b6978b6
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 10 deletions.
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R
@@ -8,16 +8,21 @@ source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/pa
 barcodes_gene <- readRDS(paste0(processed_dir, "/cell_barcodes_gene.rds"))
 
 # Compute the number of UMIs in each cell
-exp_mat_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds")) %>% sceptre::load_fbm()
+exp_mat_metadata_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds"))
+exp_mat_metadata_t$backingfile <- paste0(processed_dir, "/expression_matrix_t")
+exp_mat_t <- exp_mat_metadata_t %>% sceptre::load_fbm()
+
 n_umis_per_cell <- big_apply(exp_mat_t, function(X, ind) {colSums(X[,ind])}) %>% unlist()
 gene_covariate_matrix <- data.frame(cell_barcode = barcodes_gene, log_n_umis = n_umis_per_cell)
 gene_barcode_original_order <- gene_covariate_matrix$cell_barcode
 
 # Compute the mean expression of each gene
 gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
-exp_mat <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) %>% load_fbm()
+exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds"))
+exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix")
+exp_mat <- load_fbm(exp_mat_metadata)
 gene_expression_p <- big_apply(exp_mat, function(X, ind) colMeans(X[,ind] >= 1)) %>% unlist()
-highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.08)]
+highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.005)] # threshold of 0.005 (1/2%)
 
 ##########################################
 # Load gRNA data and gRNA covariate matrix
@@ -57,7 +62,6 @@ gRNA_indic_mat <- fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_di
 # save subsetted cell-by-gene expression matrix
 ###############################################
 gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
-exp_mat <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) %>% load_fbm()
 exp_mat_mem <- exp_mat[,seq(1, ncol(exp_mat))]
 exp_mat_sub <- exp_mat_mem[cell_subset,]
 exp_mat_sub_disk <- FBM(nrow = nrow(exp_mat_sub),
@@ -72,4 +76,3 @@ exp_mat_sub_metadata <- list(nrow = nrow(exp_mat_sub),
                              type = "unsigned short",
                              backingfile = paste0(analysis_ready_dir, "/expression_matrix_sub"))
 saveRDS(object = exp_mat_sub_metadata, paste0(analysis_ready_dir, "/exp_mat_sub_metadata.rds"))
-
diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R
@@ -13,7 +13,7 @@ library(biomaRt)
 #####################
 # 1. gene positions
 #####################
-gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds'))   # 5947 genes (now 2437 genes)
+gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds'))   # 5129
 gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist()
 ensembl <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
 ensembl.37 <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37)
@@ -59,7 +59,6 @@ saveRDS(gRNA.mart, file = paste0(processed_dir, "/gRNA_mart.rds"))
 ###################################
 # 3. gene-potential enhancer pairs
 ###################################
-
 # From Gasperini paper, the distance between gRNA and gene is calculated as (TSS of gene - midpoint of gRNA).
 # The selection is performed chr by chr
 chr.select = intersect(gRNA.mart$chr, gene.mart$chr)
@@ -85,7 +84,7 @@ for(chr in chr.select){
   # cat(chr, ' is done! \n')
 }
 dim(select.gRNA.gene.pair)
-# 3530 pairs of gene and gRNA (now 1221)
+# 2376 pairs of gene and gRNA
 saveRDS(select.gRNA.gene.pair, file = paste0(processed_dir, "/select_gRNA_gene_pair.rds"))
 
 ### sanity check: are the genes and gRNAs on the same chromosome?
@@ -107,7 +106,7 @@ for(i in 1:length(tf.ensembl.id)){
   n = sum(tf.gene$Animal.TFDB == tf.ensembl.id[i])
   if(n > 1){
     cat(i, '\t')
-    if(tf.ensembl.id[i] != ''){
+    if(tf.ensembl.id[i] != '') {
       delete.tf.line = c(delete.tf.line, which(tf.gene$Animal.TFDB == tf.ensembl.id[i])[1])
     }
   }
@@ -217,7 +216,7 @@ df2_cis_pairs <- select.gRNA.gene.pair %>% rename("gRNA_id" = "gRNA.id", "gene_i
 bulk_regions <- readRDS(paste0(processed_dir, "/bulk_region_names.rds")) %>% filter(region_name %in% c("ARL15-enh", "MYB-enh-3"))
 df3_bulk_validation <- expand.grid(gene_id = unique(c(bulk_regions$targeted_gene_id, gene.id)),
                                    gRNA_id = bulk_regions$region) %>% mutate(type = "bulk_validation")
-fst::write_fst(x = df3_bulk_validation, path = paste0(processed_dir, "/gRNA_gene_pairs_bulk_validation.fst"))
+# fst::write_fst(x = df3_bulk_validation, path = paste0(processed_dir, "/gRNA_gene_pairs_bulk_validation.fst"))
 
 all_pairs <- rbind(df1_neg_control, df2_cis_pairs, df3_bulk_validation)
 all_pairs_f <- mutate_all(all_pairs, factor)