Skip to content

Commit

Permalink
added createPairs, computefeaturesGeneric and computeFeaturesCellTYpe
Browse files Browse the repository at this point in the history
  • Loading branch information
saral98 committed Jul 11, 2022
1 parent 50a6906 commit 8f04d83
Show file tree
Hide file tree
Showing 17 changed files with 667 additions and 368 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Version: 0.0.0.9000
Authors@R:
person(given = "Sara",
family = "Lopez Ruiz de Vargas",
role = c("cre"),
role = c("cre", "mantainer"),
email = "lopez_s@molgen.mpg.de",
comment = c(ORCID = "YOUR-ORCID-ID"))
person(given = "Trisevgeni",
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Generated by roxygen2: do not edit by hand

export(compute_features)
export(computeCellTypeFeatures)
export(createPairs)
3 changes: 3 additions & 0 deletions R/centrePrediction.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
centrePrediction <- function(features_generic, features_celltype, model){

}
141 changes: 141 additions & 0 deletions R/computeCellTypeFeatures.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#' Compute cell type specific features
#'
#' Computes the cell type specific features needed for the CENTRE classification
#' step.
#'
#' @param file Path to a file with gene and enhancer pairs.
#' @param metaData Dataframe indicating the paths to the ChIP-seq experiments.
#' More information on the format here `crupR::normalize()`
#' @param cores Number of cores to compute the CRUP score features
#' @param tpmpath Path to a file with the RNA-seq TPM values, with the names of
#' the genes given as ENSEMBLE ID's
#'
#' @return
#' A table containting the following computed features :
#'* CRUP enhancer score for enhancer region, promoter region and the region between the enhancer and the promoter
#'* CRUP promoter score for enhancer region, promoter region and the region between the enhancer and the promoter
#'* RNA-seq TPM values
#'
#'
#' @export
#'
#' @examples
#'files <- c(
#'"/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/H3K4me1/H3K4me1.bam",
#'"/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/H3K4me3/H3K4me3.bam",
#'"/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/H3K27ac/H3K27ac.bam" )
#'
#'features_generics <- "/project/CRUP_scores/CRUP_scores/ENCODE/Single_ended/Tissues/body_pancreas/Controls/features_generic.bam"
#'
#'metaData <- data.frame(HM = c("H3K4me1","H3K4me3","H3K27ac"),
#' condition = c(1,1,1), replicate = c(1,1,1),
#' bamFile = files, features_genericFile = rep(features_generics,3))
#'file <- "/project/CRUP_scores/EPI/example1.txt"
#'compute_features(file, metaData, cores)
#'
#'
computeCellTypeFeatures <- function(metaData, cores, sequencing, tpmpath,
features_generic){
## Pre-eliminary checks and computations
check_file(tpmpath)
tpmfile <- read.table(tpmpath, sep = "", stringsAsFactors = F, header = T)


## Computing the crup scores
startPart("Computing CRUP score features")

## Calling normalization step only on the chromosomes we have
chr <- unique(features_generic$chr)
print(head(features_generic))
normalized <- crupR::normalize(metaData = metaData,
condition = 1,
replicate = 1,
genome = "hg38",
sequencing = "single",
chroms = chr,
cores = cores)
#Get CRUP enhancer probabilities
crup_scores_enh <- crupR::getEnhancers(data = normalized, cores = cores)
crup_scores_enh <- crup_scores_enh$data_matrix


##Making the ranges for the enhancers
list_enh <- as.data.frame(unique(features_generic$enhancer_id))
colnames(list_enh) <- c("enhancer_id")

regions_enhancer <- merge(list_enh,
ccres_enhancer[,c('V1', 'V5', 'new_start', 'new_end')],
by.x='enhancer_id',
by.y='V5')


##Making the ranges for the genes

list_prom <- as.data.frame(unique(features_generic$gene_id2))
colnames(list_prom) <- c("gene_id2")
regions_prom <- merge(list_prom,
gencode[,c('chr','gene_id1','new_start', 'new_end')],
by.x='gene_id2',
by.y='gene_id1')


cat("Getting the CRUP-EP scores for enhancer, promoter and the regulatory
distance")

crup_EP_enh <- compute_crup_enhancer(regions_enhancer, list_enh, crup_scores_enh)

crup_features <-merge(features_generic,
crup_EP_enh,
by.x="enhancer_id",
by.y="cres_name",
all.x=TRUE)

crup_EP_prom <- compute_crup_promoter(regions_prom,list_prom, crup_scores_enh)

crup_features <-merge(crup_features,
crup_EP_prom,
by.x="gene_id2",
by.y="gene_name",
all.x=TRUE)

crup_features <- compute_crup_reg_distance(crup_features, crup_scores_enh)


##Get CRUP promoter probabilities
crup_scores_prom <- crupR::getEnhancers(data = normalized, cores = cores, promprob = T)
crup_scores_prom <- crup_scores_prom$data_matrix

cat("Getting the CRUP-PP scores for enhancer")



crup_PP_enh <- compute_crup_enhancer(regions_enhancer, list_enh, crup_scores_prom)

crup_features <-merge(crup_features,
crup_PP_enh,
by.x="enhancer_id",
by.y="cres_name",
all.x=TRUE)

crup_PP_prom <- compute_crup_promoter(regions_prom, list_prom, crup_scores_prom)
crup_features <-merge(crup_features,
crup_PP_prom,
by.x="gene_id2",
by.y="gene_name",
all.x=TRUE)

crup_features <- compute_crup_reg_distance(crup_features, crup_scores_prom)


endPart()

startPart("Getting the TPM values")
features_table_all <- get_rnaseq(crup_features, tpmfile)


features_table_all[is.na(features_table_all)] <- 0

endPart()
return(features_table_all)

}
70 changes: 70 additions & 0 deletions R/computeGenericFeatures.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#' Compute generic features
#'
#' Computes the generic features needed by CENTRE to make predictions
#'
#' @param file Path to a file with either genes and enhancer pairs.
#' @param metaData Dataframe indicating the paths to the ChIP-seq experiments.
#' More information on the format here `crupR::normalize()`
#'
#'
#' @return
#' A table containting the following computed features :
#'* distance: Distance between gene and enhancer
#'* combined_tests: Combined value of the Wilcoxon tests (CAGE, DNase
#'expression, CRUP expression and DNase DNase)
#'* crup_COR: CRUP correlation scores
#'
#' @examples
#'
#'
#'
computeGenericFeatures <- function(file){
## Pre-eliminary checks and computations

check_file(file)
x <- read.table(file, sep = "\t", stringsAsFactors = F)

x$gene_id1 <- gsub("\\..*","",x[,1])

## Computing the distance features

startPart("Computing distance features")


colnames(x) <- c("gene_id", "enhancer_id", "gene_id2")
features_distances <- distances_gene_enhancer(x)

cat("Removing pairs with distance over 500 Kb")
features_distances <- features_distances[abs(features_distances$distance)
<= 500000,]
endPart()

## Getting the values for the Wilcoxon tests and the CRUP correlations
startPart("Get Wilcoxon tests and CRUP correlations")

features_distances$pair <- paste(features_distances$enhancer_id,
features_distances$gene_id2,
sep = "_")
wilcoxon_features <- wilcoxon_test_crup_cor(features_distances)

## Combining the values of the Wilcoxon tests

wilcoxon_features$combined_tests <- scran::combinePValues(
wilcoxon_features$cage_wilcoxon_test,
wilcoxon_features$dhsexp_wilcoxon_test,
wilcoxon_features$crupexp_wilcoxon_test,
wilcoxon_features$dhsdhs_wilcoxon_test,
method="fisher")

wilcoxon_features$combined_tests <-log(wilcoxon_features$combined_tests)

## Return the table of features
features_table <- wilcoxon_features[,c("gene_id2", "enhancer_id", "chr",
"middle_point", "transcription_start",
"distance","cor_CRUP", "combined_tests")]

features_table[is.na(features_table)] <- 0

return(features_table)

}
6 changes: 2 additions & 4 deletions R/compute_crup_ep.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ compute_crup_EP_reg_distance <- function(input, prediction) {
EP_reg_distance = GenomicRanges::elementMetadata(prediction)$score[hits_enh@to])

bins<-as.data.frame(table(cres_EP$between))
print(bins)


cres_EP1<-cres_EP[cres_EP$EP_prob>0.5,]
Expand All @@ -112,7 +111,6 @@ compute_crup_EP_reg_distance <- function(input, prediction) {
bins_pos<-as.data.frame(table(cres_EP1$between))
}

print(bins_pos)

all_bins<-merge(bins,bins_pos,by.x="Var1",by.y="Var1",all.x=TRUE)
all_bins[is.na(all_bins)] <- 0
Expand Down Expand Up @@ -205,7 +203,7 @@ compute_crup_PP_reg_distance <- function(input, prediction) {
PP_reg_distance = GenomicRanges::elementMetadata(prediction)$score[hits_enh@to])

bins<-as.data.frame(table(cres_EP$between))
print(bins)



cres_EP1<-cres_EP[cres_EP$EP_prob>0.5,]
Expand All @@ -232,7 +230,7 @@ compute_crup_PP_reg_distance <- function(input, prediction) {

reg_distance <- cbind(reg_dist_enh, norm_reg_dist_enh)

print(reg_distance)

return(reg_distance)

}
Expand Down
103 changes: 0 additions & 103 deletions R/compute_features.R

This file was deleted.

Loading

0 comments on commit 8f04d83

Please sign in to comment.