Skip to content

Commit

Permalink
use more liberal threshold of 0.005 to select genes
Browse files Browse the repository at this point in the history
  • Loading branch information
Tim committed Sep 3, 2021
1 parent c97ae1d commit b6978b6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,21 @@ source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/pa
barcodes_gene <- readRDS(paste0(processed_dir, "/cell_barcodes_gene.rds"))

# Compute the number of UMIs in each cell
exp_mat_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds")) %>% sceptre::load_fbm()
exp_mat_metadata_t <- readRDS(paste0(processed_dir, "/exp_mat_t_metadata.rds"))
exp_mat_metadata_t$backingfile <- paste0(processed_dir, "/expression_matrix_t")
exp_mat_t <- exp_mat_metadata_t %>% sceptre::load_fbm()

n_umis_per_cell <- big_apply(exp_mat_t, function(X, ind) {colSums(X[,ind])}) %>% unlist()
gene_covariate_matrix <- data.frame(cell_barcode = barcodes_gene, log_n_umis = n_umis_per_cell)
gene_barcode_original_order <- gene_covariate_matrix$cell_barcode

# Compute the mean expression of each gene
gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
exp_mat <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) %>% load_fbm()
exp_mat_metadata <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds"))
exp_mat_metadata$backingfile <- paste0(processed_dir, "/expression_matrix")
exp_mat <- load_fbm(exp_mat_metadata)
gene_expression_p <- big_apply(exp_mat, function(X, ind) colMeans(X[,ind] >= 1)) %>% unlist()
highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.08)]
highly_expressed_genes <- gene_ids[which(gene_expression_p >= 0.005)] # threshold of 0.005 (1/2%)

##########################################
# Load gRNA data and gRNA covariate matrix
Expand Down Expand Up @@ -57,7 +62,6 @@ gRNA_indic_mat <- fst::write_fst(gRNA_indic_matrix_sub, paste0(analysis_ready_di
# save subsetted cell-by-gene expression matrix
###############################################
gene_ids <- readRDS(paste0(processed_dir, "/ordered_gene_ids.RDS"))
exp_mat <- readRDS(paste0(processed_dir, "/exp_mat_metadata.rds")) %>% load_fbm()
exp_mat_mem <- exp_mat[,seq(1, ncol(exp_mat))]
exp_mat_sub <- exp_mat_mem[cell_subset,]
exp_mat_sub_disk <- FBM(nrow = nrow(exp_mat_sub),
Expand All @@ -72,4 +76,3 @@ exp_mat_sub_metadata <- list(nrow = nrow(exp_mat_sub),
type = "unsigned short",
backingfile = paste0(analysis_ready_dir, "/expression_matrix_sub"))
saveRDS(object = exp_mat_sub_metadata, paste0(analysis_ready_dir, "/exp_mat_sub_metadata.rds"))

Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ library(biomaRt)
#####################
# 1. gene positions
#####################
gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds')) # 5947 genes (now 2437 genes)
gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds')) # 5129
gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist()
ensembl <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
ensembl.37 <- biomaRt::useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37)
Expand Down Expand Up @@ -59,7 +59,6 @@ saveRDS(gRNA.mart, file = paste0(processed_dir, "/gRNA_mart.rds"))
###################################
# 3. gene-potential enhancer pairs
###################################

# From Gasperini paper, the distance between gRNA and gene is calculated as (TSS of gene - midpoint of gRNA).
# The selection is performed chr by chr
chr.select = intersect(gRNA.mart$chr, gene.mart$chr)
Expand All @@ -85,7 +84,7 @@ for(chr in chr.select){
# cat(chr, ' is done! \n')
}
dim(select.gRNA.gene.pair)
# 3530 pairs of gene and gRNA (now 1221)
# 2376 pairs of gene and gRNA
saveRDS(select.gRNA.gene.pair, file = paste0(processed_dir, "/select_gRNA_gene_pair.rds"))

### sanity check: are the genes and gRNAs on the same chromosome?
Expand All @@ -107,7 +106,7 @@ for(i in 1:length(tf.ensembl.id)){
n = sum(tf.gene$Animal.TFDB == tf.ensembl.id[i])
if(n > 1){
cat(i, '\t')
if(tf.ensembl.id[i] != ''){
if(tf.ensembl.id[i] != '') {
delete.tf.line = c(delete.tf.line, which(tf.gene$Animal.TFDB == tf.ensembl.id[i])[1])
}
}
Expand Down Expand Up @@ -217,7 +216,7 @@ df2_cis_pairs <- select.gRNA.gene.pair %>% rename("gRNA_id" = "gRNA.id", "gene_i
bulk_regions <- readRDS(paste0(processed_dir, "/bulk_region_names.rds")) %>% filter(region_name %in% c("ARL15-enh", "MYB-enh-3"))
df3_bulk_validation <- expand.grid(gene_id = unique(c(bulk_regions$targeted_gene_id, gene.id)),
gRNA_id = bulk_regions$region) %>% mutate(type = "bulk_validation")
fst::write_fst(x = df3_bulk_validation, path = paste0(processed_dir, "/gRNA_gene_pairs_bulk_validation.fst"))
# fst::write_fst(x = df3_bulk_validation, path = paste0(processed_dir, "/gRNA_gene_pairs_bulk_validation.fst"))

all_pairs <- rbind(df1_neg_control, df2_cis_pairs, df3_bulk_validation)
all_pairs_f <- mutate_all(all_pairs, factor)
Expand Down

0 comments on commit b6978b6

Please sign in to comment.