diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R index 753f78d..0f25052 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/quality_control_4.R @@ -8,16 +8,21 @@ source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/pa barcodes_gene <- readRDS(paste0(processed_dir, "/cell_barcodes_gene.rds")) # Compute the number of UMIs in each cell -exp_mat_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds")) %>% sceptre::load_fbm() +exp_mat_metadata_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds")) +exp_mat_metadata_t$backingfile <- paste0(processed_dir, "/expression_matrix_t") +exp_mat_t <- exp_mat_metadata_t %>% sceptre::load_fbm() + n_umis_per_cell <- big_apply(exp_mat_t, function(X, ind) {colSums(X[,ind])}) %>% unlist() gene_covariate_matrix <- data.frame(cell_barcode = barcodes_gene, log_n_umis = n_umis_per_cell) gene_barcode_original_order <- gene_covariate_matrix$cell_barcode # Compute the mean expression of each gene gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) -exp_mat <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) %>% load_fbm() +exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) +exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix") +exp_mat <- load_fbm(exp_mat_metadata) gene_expression_p <- big_apply(exp_mat, function(X, ind) colMeans(X[,ind] >= 1)) %>% unlist() -highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.08)] +highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.005)] # threshold of 0.005 (1/2%) ########################################## # Load gRNA data and gRNA covariate matrix @@ -57,7 +62,6 @@ gRNA_indic_mat <- fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_di # save subsetted cell-by-gene expression matrix ############################################### gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS")) -exp_mat <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) %>% load_fbm() exp_mat_mem <- exp_mat[,seq(1, ncol(exp_mat))] exp_mat_sub <- exp_mat_mem[cell_subset,] exp_mat_sub_disk <- FBM(nrow = nrow(exp_mat_sub), @@ -72,4 +76,3 @@ exp_mat_sub_metadata <- list(nrow = nrow(exp_mat_sub), type = "unsigned short", backingfile = paste0(analysis_ready_dir, "/expression_matrix_sub")) saveRDS(object = exp_mat_sub_metadata, paste0(analysis_ready_dir, "/exp_mat_sub_metadata.rds")) - diff --git a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R index 8f3cce0..0eff249 100644 --- a/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R +++ b/sceptre_paper/analysis_drivers/analysis_drivers_xie/select_gRNA_gene_pair_5.R @@ -13,7 +13,7 @@ library(biomaRt) ##################### # 1. gene positions ##################### -gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds')) # 5947 genes (now 2437 genes) +gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds')) # 5129 gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist() ensembl <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl") ensembl.37 <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37) @@ -59,7 +59,6 @@ saveRDS(gRNA.mart, file = paste0(processed_dir, "/gRNA_mart.rds")) ################################### # 3. gene-potential enhancer pairs ################################### - # From Gasperini paper, the distance between gRNA and gene is calculated as (TSS of gene - midpoint of gRNA). # The selection is performed chr by chr chr.select = intersect(gRNA.mart$chr, gene.mart$chr) @@ -85,7 +84,7 @@ for(chr in chr.select){ # cat(chr, ' is done! \n') } dim(select.gRNA.gene.pair) -# 3530 pairs of gene and gRNA (now 1221) +# 2376 pairs of gene and gRNA saveRDS(select.gRNA.gene.pair, file = paste0(processed_dir, "/select_gRNA_gene_pair.rds")) ### sanity check: are the genes and gRNAs on the same chromosome? @@ -107,7 +106,7 @@ for(i in 1:length(tf.ensembl.id)){ n = sum(tf.gene$Animal.TFDB == tf.ensembl.id[i]) if(n > 1){ cat(i, '\t') - if(tf.ensembl.id[i] != ''){ + if(tf.ensembl.id[i] != '') { delete.tf.line = c(delete.tf.line, which(tf.gene$Animal.TFDB == tf.ensembl.id[i])[1]) } } @@ -217,7 +216,7 @@ df2_cis_pairs <- select.gRNA.gene.pair %>% rename("gRNA_id" = "gRNA.id", "gene_i bulk_regions <- readRDS(paste0(processed_dir, "/bulk_region_names.rds")) %>% filter(region_name %in% c("ARL15-enh", "MYB-enh-3")) df3_bulk_validation <- expand.grid(gene_id = unique(c(bulk_regions$targeted_gene_id, gene.id)), gRNA_id = bulk_regions$region) %>% mutate(type = "bulk_validation") -fst::write_fst(x = df3_bulk_validation, path = paste0(processed_dir, "/gRNA_gene_pairs_bulk_validation.fst")) +# fst::write_fst(x = df3_bulk_validation, path = paste0(processed_dir, "/gRNA_gene_pairs_bulk_validation.fst")) all_pairs <- rbind(df1_neg_control, df2_cis_pairs, df3_bulk_validation) all_pairs_f <- mutate_all(all_pairs, factor)