Skip to content

Commit

Permalink
load gRNA matrix from analysis_ready dir
Browse files Browse the repository at this point in the history
  • Loading branch information
Tim committed Aug 30, 2021
1 parent 6033300 commit e69f91f
Showing 1 changed file with 28 additions and 27 deletions.
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
code_dir <- paste0(.get_config_path("LOCAL_CODE_DIR"), "sceptre-manuscript")
offsite_dir <- .get_config_path("LOCAL_SCEPTRE_DATA_DIR")
source(paste0(code_dir, "/sceptre_paper/analysis_drivers/analysis_drivers_xie/paths_to_dirs.R"))
analysis_ready_dir <- paste0(offsite_dir, "/data/xie/analysis_ready")

# load packages
# load packages
library(fst)
library(dplyr)
library(biomaRt)

# Both gene ID and gRNA position are get from GRCh 38.
# Both gene ID and gRNA position are get from GRCh 38.

#####################
# 1. gene positions
# 1. gene positions
#####################
gene.id <- readRDS(paste0(processed_dir, '/highly_expressed_genes.rds')) # 5947 genes (now 2437 genes)
gene.ensembl.id <- lapply(strsplit(gene.id, '[.]'), function(x){x[1]}) %>% unlist()
Expand Down Expand Up @@ -42,7 +43,7 @@ saveRDS(gene.mart, file = paste0(processed_dir, "/gene_mart.rds"))
###########################
# 2. guide RNA positions
###########################
gRNA_indic_mat <- read.fst(paste0(processed_dir, "/gRNA_indicator_matrix.fst"))
gRNA_indic_mat <- read.fst(paste0(analysis_ready_dir, "/gRNA_indicator_matrix.fst"))
gRNA.id <- colnames(gRNA_indic_mat)
temp.gRNA = strsplit(gRNA.id, ':')
gRNA.mart = data.frame(chr = lapply(temp.gRNA, function(x){x[1]}) %>% unlist)
Expand All @@ -59,11 +60,11 @@ saveRDS(gRNA.mart, file = paste0(processed_dir, "/gRNA_mart.rds"))
# 3. gene-potential enhancer pairs
###################################

# From Gasperini paper, the distance between gRNA and gene is calculated as (TSS of gene - midpoint of gRNA).
# From Gasperini paper, the distance between gRNA and gene is calculated as (TSS of gene - midpoint of gRNA).
# The selection is performed chr by chr
chr.select = intersect(gRNA.mart$chr, gene.mart$chr)
# "chr1" "chr10" "chr11" "chr12" "chr16" "chr17" "chr18" "chr19" "chr2"
# "chr20" "chr3" "chr4" "chr5" "chr6" "chr7" "chr8" "chrX"
# "chr1" "chr10" "chr11" "chr12" "chr16" "chr17" "chr18" "chr19" "chr2"
# "chr20" "chr3" "chr4" "chr5" "chr6" "chr7" "chr8" "chrX"
# 17 chromosomes

select.gRNA.gene.pair = NULL
Expand All @@ -78,7 +79,7 @@ for(chr in chr.select){
gRNA.pos = gRNA.mart$mid_position[gRNA.chr.id]
distance.temp = sapply(gene.tss, function(x){x - gRNA.pos})
temp.id = which(abs(distance.temp) < 1000000, arr.ind = T)
select.gRNA.gene.pair = rbind(select.gRNA.gene.pair,
select.gRNA.gene.pair = rbind(select.gRNA.gene.pair,
data.frame(gene.id = gene.mart$original.id[gene.chr.id[temp.id[, 2]]],
gRNA.id = gRNA.id[gRNA.chr.id[temp.id[, 1]]]))
# cat(chr, ' is done! \n')
Expand All @@ -88,7 +89,7 @@ dim(select.gRNA.gene.pair)
saveRDS(select.gRNA.gene.pair, file = paste0(processed_dir, "/select_gRNA_gene_pair.rds"))


####### Select transcription factor(TF) and their potiential enhancer
####### Select transcription factor(TF) and their potiential enhancer
##############################
# 4. select genes that are TF
##############################
Expand All @@ -105,7 +106,7 @@ for(i in 1:length(tf.ensembl.id)){
}
}
}
# 24 35 183 923 1661
# 24 35 183 923 1661
# delete tf.gene lines: 24, 193, 989, 1771

tf.gene.new = tf.gene[-delete.tf.line, ]
Expand All @@ -119,7 +120,7 @@ saveRDS(tf.gene.select, file = paste0(processed_dir, "/tf_gene_select.rds"))


######################################################
# 5. Find gRNAs that are pontential enhancers for TF
# 5. Find gRNAs that are pontential enhancers for TF
######################################################
### Find gRNAs that might be potential enhancers
tf.match.temp <- cbind(match(tf.gene.new$Gene, temp$hgnc_symbol), match(tf.gene.new$Animal.TFDB, temp$ensembl_gene_id))
Expand All @@ -134,8 +135,8 @@ tf.gene.mart$chr <- paste0('chr', tf.gene.mart$chromosome_name)
saveRDS(tf.gene.mart, file = paste0(processed_dir, "/tf_gene_mart.rds"))

chr.select <- intersect(tf.gene.mart$chr, gRNA.mart$chr) # 17 chromosome intersect
# [1] "chr16" "chr7" "chr17" "chr19" "chr12" "chr2" "chr8" "chr20" "chr18"
# "chr4" "chrX" "chr5" "chr1" "chr11" "chr10" "chr3" "chr6"
# [1] "chr16" "chr7" "chr17" "chr19" "chr12" "chr2" "chr8" "chr20" "chr18"
# "chr4" "chrX" "chr5" "chr1" "chr11" "chr10" "chr3" "chr6"
select.gRNA.tf.pair <- NULL

for(chr in chr.select){
Expand All @@ -148,12 +149,12 @@ for(chr in chr.select){
gene.tss[gene.strand == -1] = gene.pos.end[gene.strand == -1]
gRNA.pos = gRNA.mart$mid_position[gRNA.chr.id]
distance.temp = sapply(gene.tss, function(x){x - gRNA.pos})
temp.id = which(abs(distance.temp) < 1000000, arr.ind = T)
temp.id = which(abs(distance.temp) < 1000000, arr.ind = T)
# 1. Distance less than 1MB; 2. gRNA is in the front of gene region
select.gRNA.tf.pair = rbind(select.gRNA.tf.pair,
data.frame(ensembl.gene.id = tf.gene.mart$ensembl_gene_id[gene.chr.id[temp.id[, 2]]],
hgnc_symbol = tf.gene.mart$hgnc_symbol[gene.chr.id[temp.id[, 2]]],

select.gRNA.tf.pair = rbind(select.gRNA.tf.pair,
data.frame(ensembl.gene.id = tf.gene.mart$ensembl_gene_id[gene.chr.id[temp.id[, 2]]],
hgnc_symbol = tf.gene.mart$hgnc_symbol[gene.chr.id[temp.id[, 2]]],
gRNA.id = gRNA.id[gRNA.chr.id[temp.id[, 1]]]))
cat(chr, ' is done! \n')
}
Expand All @@ -165,7 +166,7 @@ saveRDS(select.gRNA.tf.pair, file = paste0(processed_dir, "/select_gRNA_tf_pair.
##############################
### gRNA that are not close to any transcription factor, pair with the set of genes on different chormosomes.

chr.select <- intersect(tf.gene.mart$chr, gRNA.mart$chr)
chr.select <- intersect(tf.gene.mart$chr, gRNA.mart$chr)
num.neg.pair = NULL
neg.control.pair <- NULL
for(chr in chr.select){
Expand All @@ -176,17 +177,17 @@ for(chr in chr.select){
tf.strand <- tf.gene.mart$strand[tf.chr.id]
tf.tss <- tf.pos.start
tf.tss[tf.strand == -1] <- tf.pos.end[tf.strand == -1]

gRNA.pos <- gRNA.mart$mid_position[gRNA.chr.id]
distance.temp <- sapply(tf.tss, function(x){x - gRNA.pos})
temp.gRNA = gRNA.id[gRNA.chr.id[which(rowSums(abs(distance.temp) > 1000000) == length(tf.tss))]]
# Distance greater than 1MB
temp.gene.ensembl = gene.id[ gene.mart$chr != chr & !(gene.mart$hgnc_symbol %in% tf.gene.select) ]
temp.hgnc = gene.mart$hgnc_symbol[ gene.mart$chr != chr & !(gene.mart$hgnc_symbol %in% tf.gene.select)]
neg.control.pair <- rbind(neg.control.pair,
data.frame(gRNA.id = rep(temp.gRNA, length(temp.gene.ensembl)),
gene.id = rep(temp.gene.ensembl, each = length(temp.gRNA)),

neg.control.pair <- rbind(neg.control.pair,
data.frame(gRNA.id = rep(temp.gRNA, length(temp.gene.ensembl)),
gene.id = rep(temp.gene.ensembl, each = length(temp.gRNA)),
gene.hgnc.id = rep(temp.hgnc, each = length(temp.gRNA))))
num.neg.pair = rbind(num.neg.pair, data.frame(chr = chr, gRNA = length(temp.gRNA), gene = length(temp.gene.ensembl)))
cat(chr, 'have', length(temp.gRNA), 'gRNAs.', length(temp.gene.ensembl), ' genes not in ', chr, '. Done! \n')
Expand All @@ -199,17 +200,17 @@ saveRDS(num.neg.pair, file = paste0(processed_dir, '/num_neg_pair.rds'))
##############################################
set.seed(4)

df1_neg_control <- neg.control.pair %>% mutate(gene.hgnc.id = NULL) %>% rename("gRNA_id" = "gRNA.id", "gene_id" = "gene.id") %>%
df1_neg_control <- neg.control.pair %>% mutate(gene.hgnc.id = NULL) %>% rename("gRNA_id" = "gRNA.id", "gene_id" = "gene.id") %>%
group_by(gRNA_id) %>% slice_sample(n = 500) %>% mutate(type = "negative_control") %>% ungroup()
df2_cis_pairs <- select.gRNA.gene.pair %>% rename("gRNA_id" = "gRNA.id", "gene_id" = "gene.id") %>%
df2_cis_pairs <- select.gRNA.gene.pair %>% rename("gRNA_id" = "gRNA.id", "gene_id" = "gene.id") %>%
mutate(type = "cis")
bulk_regions <- readRDS(paste0(processed_dir, "/bulk_region_names.rds")) %>% filter(region_name %in% c("ARL15-enh", "MYB-enh-3"))
df3_bulk_validation <- expand.grid(gene_id = gene.id, gRNA_id = bulk_regions$region) %>% mutate(type = "bulk_validation")

all_pairs <- rbind(df1_neg_control, df2_cis_pairs, df3_bulk_validation)
all_pairs_f <- mutate_all(all_pairs, factor)

write_fst(x = all_pairs_f, path = paste0(processed_dir, "/gRNA_gene_pairs.fst"))
write_fst(x = all_pairs_f, path = paste0(processed_dir, "/gRNA_gene_pairs.fst"))

########################################################################################
# Sanity check: for a few cis pairs, verify the gene and gRNA are on the same chromosome
Expand Down

0 comments on commit e69f91f

Please sign in to comment.