added createPairs, computefeaturesGeneric and computeFeaturesCellTYpe

slrvv · Jul 11, 2022 · 8f04d83 · 8f04d83
1 parent 50a6906
commit 8f04d83
Show file tree

Hide file tree

Showing 17 changed files with 667 additions and 368 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,7 +5,7 @@ Version: 0.0.0.9000
 Authors@R:
     person(given = "Sara",
            family = "Lopez Ruiz de Vargas",
-           role = c("cre"),
+           role = c("cre", "mantainer"),
            email = "lopez_s@molgen.mpg.de",
            comment = c(ORCID = "YOUR-ORCID-ID"))
     person(given = "Trisevgeni",

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,3 +1,4 @@
 # Generated by roxygen2: do not edit by hand
 
-export(compute_features)
+export(computeCellTypeFeatures)
+export(createPairs)
diff --git a/R/centrePrediction.R b/R/centrePrediction.R
@@ -0,0 +1,3 @@
+centrePrediction <- function(features_generic, features_celltype, model){
+
+}
diff --git a/R/computeCellTypeFeatures.R b/R/computeCellTypeFeatures.R
@@ -0,0 +1,141 @@
+#' Compute cell type specific features
+#'
+#' Computes the cell type specific features needed for the CENTRE classification
+#' step.
+#'
+#' @param file Path to a file with gene and enhancer pairs.
+#' @param metaData Dataframe indicating the paths to the ChIP-seq experiments.
+#' More information on the format here `crupR::normalize()`
+#' @param cores Number of cores to compute the CRUP score features
+#' @param tpmpath Path to a file with the RNA-seq TPM values, with the names of
+#' the genes given as ENSEMBLE ID's
+#'
+#' @return
+#' A table containting the following computed features :
+#'* CRUP enhancer score for enhancer region, promoter region and the region between the enhancer and the promoter
+#'* CRUP promoter score for enhancer region, promoter region and the region between the enhancer and the promoter
+#'* RNA-seq TPM values
+#'
+#'
+#' @export
+#'
+#' @examples
+#'files <- c(
+#'"/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/H3K4me1/H3K4me1.bam",
+#'"/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/H3K4me3/H3K4me3.bam",
+#'"/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/H3K27ac/H3K27ac.bam" )
+#'
+#'features_generics <- "/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/Controls/features_generic.bam"
+#'
+#'metaData <- data.frame(HM = c("H3K4me1","H3K4me3","H3K27ac"),
+#'                       condition = c(1,1,1), replicate = c(1,1,1),
+#'                       bamFile = files, features_genericFile = rep(features_generics,3))
+#'file <- "/project/CRUP_scores/EPI/example1.txt"
+#'compute_features(file, metaData, cores)
+#'
+#'
+computeCellTypeFeatures <- function(metaData, cores, sequencing, tpmpath,
+                                    features_generic){
+  ## Pre-eliminary checks and computations
+  check_file(tpmpath)
+  tpmfile <- read.table(tpmpath, sep = "", stringsAsFactors = F, header = T)
+
+
+  ## Computing the crup scores
+  startPart("Computing CRUP score features")
+
+  ## Calling normalization step only on the chromosomes we have
+  chr <- unique(features_generic$chr)
+  print(head(features_generic))
+  normalized <- crupR::normalize(metaData = metaData,
+                                 condition = 1,
+                                 replicate = 1,
+                                 genome = "hg38",
+                                 sequencing = "single",
+                                 chroms = chr,
+                                 cores = cores)
+  #Get CRUP enhancer probabilities
+  crup_scores_enh <- crupR::getEnhancers(data = normalized, cores = cores)
+  crup_scores_enh <- crup_scores_enh$data_matrix
+
+
+  ##Making the ranges for the enhancers
+  list_enh <- as.data.frame(unique(features_generic$enhancer_id))
+  colnames(list_enh) <- c("enhancer_id")
+
+  regions_enhancer <-  merge(list_enh,
+                    ccres_enhancer[,c('V1', 'V5', 'new_start', 'new_end')],
+                    by.x='enhancer_id',
+                    by.y='V5')
+
+
+  ##Making the ranges for the genes
+
+  list_prom <- as.data.frame(unique(features_generic$gene_id2))
+  colnames(list_prom) <- c("gene_id2")
+  regions_prom <- merge(list_prom,
+                   gencode[,c('chr','gene_id1','new_start', 'new_end')],
+                   by.x='gene_id2',
+                   by.y='gene_id1')
+
+
+  cat("Getting the CRUP-EP scores for enhancer, promoter and the regulatory
+      distance")
+
+  crup_EP_enh <- compute_crup_enhancer(regions_enhancer, list_enh, crup_scores_enh)
+
+  crup_features <-merge(features_generic,
+                        crup_EP_enh,
+                        by.x="enhancer_id",
+                        by.y="cres_name",
+                        all.x=TRUE)
+
+  crup_EP_prom <- compute_crup_promoter(regions_prom,list_prom, crup_scores_enh)
+
+  crup_features <-merge(crup_features,
+                        crup_EP_prom,
+                        by.x="gene_id2",
+                        by.y="gene_name",
+                        all.x=TRUE)
+
+  crup_features <- compute_crup_reg_distance(crup_features, crup_scores_enh)
+
+
+  ##Get CRUP promoter probabilities
+  crup_scores_prom <- crupR::getEnhancers(data = normalized, cores = cores, promprob = T)
+  crup_scores_prom <- crup_scores_prom$data_matrix
+
+  cat("Getting the CRUP-PP scores for enhancer")
+
+
+
+  crup_PP_enh <- compute_crup_enhancer(regions_enhancer, list_enh, crup_scores_prom)
+
+  crup_features <-merge(crup_features,
+                        crup_PP_enh,
+                        by.x="enhancer_id",
+                        by.y="cres_name",
+                        all.x=TRUE)
+
+  crup_PP_prom <- compute_crup_promoter(regions_prom, list_prom, crup_scores_prom)
+  crup_features <-merge(crup_features,
+                        crup_PP_prom,
+                        by.x="gene_id2",
+                        by.y="gene_name",
+                        all.x=TRUE)
+
+  crup_features <- compute_crup_reg_distance(crup_features, crup_scores_prom)
+
+
+  endPart()
+
+  startPart("Getting the TPM values")
+  features_table_all <- get_rnaseq(crup_features, tpmfile)
+
+
+  features_table_all[is.na(features_table_all)] <- 0
+
+  endPart()
+  return(features_table_all)
+
+}
diff --git a/R/computeGenericFeatures.R b/R/computeGenericFeatures.R
@@ -0,0 +1,70 @@
+#' Compute generic features
+#'
+#' Computes the generic features needed by CENTRE to make predictions
+#'
+#' @param file Path to a file with either genes and enhancer pairs.
+#' @param metaData Dataframe indicating the paths to the ChIP-seq experiments.
+#' More information on the format here `crupR::normalize()`
+#'
+#'
+#' @return
+#' A table containting the following computed features :
+#'* distance: Distance between gene and enhancer
+#'* combined_tests: Combined value of the Wilcoxon tests (CAGE, DNase
+#'expression, CRUP expression and DNase DNase)
+#'* crup_COR: CRUP correlation scores
+#'
+#' @examples
+#'
+#'
+#'
+computeGenericFeatures <- function(file){
+  ## Pre-eliminary checks and computations
+
+  check_file(file)
+  x <- read.table(file, sep = "\t", stringsAsFactors = F)
+
+  x$gene_id1 <- gsub("\\..*","",x[,1])
+
+  ## Computing the distance features
+
+  startPart("Computing distance features")
+
+
+  colnames(x) <- c("gene_id", "enhancer_id", "gene_id2")
+  features_distances <- distances_gene_enhancer(x)
+
+  cat("Removing pairs with distance over 500 Kb")
+  features_distances <- features_distances[abs(features_distances$distance)
+                                           <= 500000,]
+  endPart()
+
+  ## Getting the values for the Wilcoxon tests and the CRUP correlations
+  startPart("Get Wilcoxon tests and CRUP correlations")
+
+  features_distances$pair <- paste(features_distances$enhancer_id,
+                                   features_distances$gene_id2,
+                                   sep = "_")
+  wilcoxon_features <- wilcoxon_test_crup_cor(features_distances)
+
+  ## Combining the values of the Wilcoxon tests
+
+  wilcoxon_features$combined_tests <- scran::combinePValues(
+    wilcoxon_features$cage_wilcoxon_test,
+    wilcoxon_features$dhsexp_wilcoxon_test,
+    wilcoxon_features$crupexp_wilcoxon_test,
+    wilcoxon_features$dhsdhs_wilcoxon_test,
+    method="fisher")
+
+  wilcoxon_features$combined_tests <-log(wilcoxon_features$combined_tests)
+
+  ## Return the table of features
+  features_table <- wilcoxon_features[,c("gene_id2", "enhancer_id", "chr",
+                                         "middle_point", "transcription_start",
+                                         "distance","cor_CRUP", "combined_tests")]
+
+  features_table[is.na(features_table)] <- 0
+
+  return(features_table)
+
+}
diff --git a/R/compute_crup_ep.R b/R/compute_crup_ep.R
@@ -97,7 +97,6 @@ compute_crup_EP_reg_distance <- function(input, prediction) {
                         EP_reg_distance = GenomicRanges::elementMetadata(prediction)$score[hits_enh@to])
 
   bins<-as.data.frame(table(cres_EP$between))
-  print(bins)
 
 
   cres_EP1<-cres_EP[cres_EP$EP_prob>0.5,]
@@ -112,7 +111,6 @@ compute_crup_EP_reg_distance <- function(input, prediction) {
     bins_pos<-as.data.frame(table(cres_EP1$between))
   }
 
-  print(bins_pos)
 
   all_bins<-merge(bins,bins_pos,by.x="Var1",by.y="Var1",all.x=TRUE)
   all_bins[is.na(all_bins)] <- 0
@@ -205,7 +203,7 @@ compute_crup_PP_reg_distance <- function(input, prediction) {
               PP_reg_distance = GenomicRanges::elementMetadata(prediction)$score[hits_enh@to])
 
   bins<-as.data.frame(table(cres_EP$between))
-  print(bins)
+
 
 
   cres_EP1<-cres_EP[cres_EP$EP_prob>0.5,]
@@ -232,7 +230,7 @@ compute_crup_PP_reg_distance <- function(input, prediction) {
 
   reg_distance <- cbind(reg_dist_enh, norm_reg_dist_enh)
 
-  print(reg_distance)
+
   return(reg_distance)
 
 }

diff --git a/R/compute_features.R b/R/compute_features.R
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		centrePrediction <- function(features_generic, features_celltype, model){

		}