Skip to content

Commit

Permalink
fix several bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
ZW-xjtlu committed Nov 29, 2019
1 parent a550d4a commit f097a89
Show file tree
Hide file tree
Showing 57 changed files with 645 additions and 615 deletions.
Binary file modified .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
*.Rproj
8 changes: 6 additions & 2 deletions DESCRIPTION
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@ Title: exome-based anlaysis of MeRIP-Seq data: peak calling and
Version: 2.16.0
Date: 2017-10-24
Author: Lin Zhang <lin.zhang@cumt.edu.cn>, Lian Liu <liulian19860905@163.com>, Jia Meng <jia.meng@xjtlu.edu.cn>
Maintainer: Lin Zhang <lin.zhang@cumt.edu.cn>, Lian Liu <liulian19860905@163.com>, Jia Meng <jia.meng@xjtlu.edu.cn>
Maintainer: Lin Zhang <lin.zhang@cumt.edu.cn>, Lian Liu <liulian19860905@163.com>, Jia Meng <jia.meng@xjtlu.edu.cn>, Zhen Wei <zhen.wei@xjtlu.edu.cn>
Description: The package is developed for the analysis of affinity-based epitranscriptome shortgun sequencing data from MeRIP-seq (maA-seq). It was built on the basis of the exomePeak MATLAB package (Meng, Jia, et al. "Exome-based analysis for RNA epigenome sequencing data." Bioinformatics 29.12 (2013): 1565-1567.) with new functions for differential analysis of two experimental conditions to unveil the dynamics in post-transcriptional regulation of the RNA methylome. The exomePeak R-package accepts and statistically supports multiple biological replicates, internally removes PCR artifacts and multi-mapping reads, outputs exome-based binding sites (RNA methylation sites) and detects differential post-transcriptional RNA modification sites between two experimental conditions in term of percentage rather the absolute amount. The package is still under active development, and we welcome all biology and computation scientist for all kinds of collaborations and communications. Please feel free to contact Dr. Jia Meng <jia.meng@hotmail.com> if you have any questions.
License: GPL-2
Depends: Rsamtools, GenomicFeatures (>= 1.14.5), rtracklayer,
GenomicAlignments
biocViews: Sequencing, HighThroughputSequencing, Methylseq, RNAseq
Suggests:
knitr,
rmarkdown
biocViews: Sequencing, MethylSeq, RNASeq
NeedsCompilation: no
VignetteBuilder: knitr
Packaged: 2018-10-31 00:00:32 UTC; biocbuild
git_url: https://git.bioconductor.org/packages/exomePeak
git_branch: RELEASE_3_8
Expand Down
4 changes: 3 additions & 1 deletion NAMESPACE
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ export(rhtest)
export(exomepeak)
export(bltest)
export(RMT)

import(GenomicFeatures)
import(Rsamtools)
import(GenomicAlignments)
2 changes: 2 additions & 0 deletions NEWS
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ version 2.13.0: (2017-10-24)
- package overview revised
- revision due to the changes of a dependent package

version 2.14.0: (2019-11-29)
- fixed a few bugs, by Zhen Wei
107 changes: 53 additions & 54 deletions R/RMT.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
RMT <- function(
INPUT_BAM,
IP_BAM,
INPUT2IP = NA,
GENE_ANNO_GTF = NA,
GENOME = NA,
INPUT2IP = NA,
GENE_ANNO_GTF = NA,
GENOME = NA,
UCSC_TABLE_NAME = "knownGene",
TXDB = NA,
EXOME_OUTPUT_DIR = NA,
Expand All @@ -24,36 +24,35 @@ RMT <- function(
PARAMETERS$UCSC_TABLE_NAME=UCSC_TABLE_NAME
PARAMETERS$TXDB = TXDB


# dependent variables
if (is.na(PARAMETERS$EXOME_OUTPUT_DIR)) {
PARAMETERS$EXOME_OUTPUT_DIR <- getwd()
}
if (is.na(PARAMETERS$GO_OUTPUT_DIR)) {
PARAMETERS$GO_OUTPUT_DIR <- getwd()
}

# algrithm ##################################################
#peak calling
input_length <- length(PARAMETERS$INPUT_BAM)
ip_length <- length(PARAMETERS$IP_BAM)
all_filepath <- vector()

# decide whether paired
temp1 <- is.na(PARAMETERS$INPUT2IP)
flag <- temp1[1]


if(flag){
for(i in 1:ip_length){
experiment_name = paste(PARAMETERS$GO_EXPERIMENT_NAME,"/temp/IP_rep_",i,sep="")
all_filepath[i] = PARAMETERS$IP_BAM[i]
print(paste("Processing IP sample",i))
temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF,
temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF,
GENOME = GENOME, UCSC_TABLE_NAME = UCSC_TABLE_NAME,
TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]),
INPUT_BAM = c(PARAMETERS$INPUT_BAM[i]),
OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR,
TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]),
INPUT_BAM = c(PARAMETERS$INPUT_BAM[i]),
OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR,
POISSON_MEAN_RATIO = 1,
EXPERIMENT_NAME = experiment_name)
}
Expand All @@ -65,19 +64,19 @@ RMT <- function(
for(j in 2:length(PARAMETERS$INPUT2IP[[i]])){
input_bam=c(input_bam,c(PARAMETERS$INPUT_BAM[PARAMETERS$INPUT2IP[[i]][j]]))
}
}
}
all_filepath[i] <- paste(PARAMETERS$EXOME_OUTPUT_DIR,experiment_name,sep = '/')
print(paste("Processing IP sample",i))
temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF,
temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF,
GENOME = GENOME, UCSC_TABLE_NAME = UCSC_TABLE_NAME,
TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]),
INPUT_BAM = input_bam,
OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR,
POISSON_MEAN_RATIO = 1,
EXPERIMENT_NAME = experiment_name)
TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]),
INPUT_BAM = input_bam,
OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR,
POISSON_MEAN_RATIO = 1,
EXPERIMENT_NAME = experiment_name)
}
}

#merge all peak
for(i in 1:length(all_filepath)){
path <- paste(all_filepath[i],"peak.xls",sep = '/')
Expand All @@ -86,7 +85,7 @@ RMT <- function(
all_peak <- bam_file
} else {
all_peak = rbind(all_peak,bam_file[2:nrow(bam_file), ])
}
}
}
dir.create(paste(PARAMETERS$GO_OUTPUT_DIR,
PARAMETERS$GO_EXPERIMENT_NAME,sep = '/'),
Expand All @@ -96,28 +95,28 @@ RMT <- function(
write.table(all_peak,paste(dir,"all_peak.xls",sep = '/'),
sep = "\t",quote = FALSE,col.names = FALSE,
row.names = FALSE)

#read all peaks and transform GRangesList
filepath = paste(dir,"all_peak.xls",sep = '/')
matrix_peak_read = read.table(filepath,header = TRUE,stringsAsFactors = FALSE)

ori_peak <- .xls2Grangeslist(filepath)

#remove overlaps
overlaps <- findOverlaps(ori_peak,ori_peak)
matrix_overlap <- matrix(0,length(overlaps),2)
matrix_overlap[,1] <- queryHits(overlaps)
matrix_overlap[,2] <- subjectHits(overlaps)

num <- c(rep(0,length(ori_peak)))

for(i in 1:nrow(matrix_overlap)){
if(matrix_overlap[i, 1][1][[1]] != matrix_overlap[i, 2][1][[1]]){
x <- ranges(ori_peak[matrix_overlap[i, 1]])[[1]]@width
y <- ranges(ori_peak[matrix_overlap[i, 2]])[[1]]@width
#num[i]=if(x<y) matrix_overlap[i,1][1][[1]] else matrix_overlap[i,2][1][[1]]
if(x < y){
num[matrix_overlap[i, 1][1][[1]]] <- matrix_overlap[i, 1][1][[1]]
num[matrix_overlap[i, 1][1][[1]]] <- matrix_overlap[i, 1][1][[1]]
} else if(x > y){
num[matrix_overlap[i, 2][1][[1]]] <- matrix_overlap[i, 2][1][[1]]
} else {
Expand All @@ -130,12 +129,12 @@ RMT <- function(
tab <- which(num == 0)
write.table(matrix_peak_read[tab, ],paste(dir,"merged_peak.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
write.table(matrix_peak_read[tab,1:12 ],paste(dir,"merged_peak.bed",sep = '/'),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)


# read gene annotation from internet
txdb <- .readTxDb2(PARAMETERS)
exonRanges <- exonsBy(txdb, "gene")

#rpkm in all inputbams and ipbams
rpkm_input <- matrix(0,length(peak),length(PARAMETERS$INPUT_BAM))
rpkm_ip <- matrix(0,length(peak),length(PARAMETERS$IP_BAM))
Expand All @@ -147,7 +146,7 @@ RMT <- function(
aligns <- readGAlignments(filepath)
para <- ScanBamParam(what="mapq")
mapq <- scanBam(filepath, param=para)[[1]][[1]]
# filter reads with mapq smaller than 30.
# filter reads with mapq smaller than 30.
mapq[is.na(mapq)] <- 255 # Note: mapq "NA" means mapq = 255
ID_keep <- (mapq >30)
filtered <- aligns[ID_keep]
Expand All @@ -160,7 +159,7 @@ RMT <- function(
millionsMapped <- length(transcriptome_filtered_aligns) / 1000000
rpm <- counts / millionsMapped
rpkm_input[,i] <- rpm / geneLengthsInKB

}
write.table(count_input,paste(dir,"count_input.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
write.table(rpkm_input,paste(dir,"rpkm_input.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
Expand All @@ -170,7 +169,7 @@ RMT <- function(
aligns <- readGAlignments(filepath)
para <- ScanBamParam(what="mapq")
mapq <- scanBam(filepath, param=para)[[1]][[1]]
# filter reads with mapq smaller than 30.
# filter reads with mapq smaller than 30.
mapq[is.na(mapq)] <- 255 # Note: mapq "NA" means mapq = 255
ID_keep <- (mapq >30)
filtered <- aligns[ID_keep]
Expand All @@ -182,14 +181,14 @@ RMT <- function(
geneLengthsInKB <- numBases / 1000
millionsMapped <- length(transcriptome_filtered_aligns) / 1000000
rpm <- counts / millionsMapped
rpkm_ip[,i] <- rpm / geneLengthsInKB
rpkm_ip[,i] <- rpm / geneLengthsInKB
}
write.table(count_ip,paste(dir,"count_ip.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
write.table(rpkm_ip,paste(dir,"rpkm_ip.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)

#return to all conditions
matrix_log2_fc = matrix(0,length(peak),ip_length)
if(is.na(PARAMETERS$INPUT2IP)){
if(any(is.na(PARAMETERS$INPUT2IP))){
for(i in 1:ip_length){
matrix_log2_fc[,i] <- log2(rpkm_ip[,i]+0.01)-log2(rpkm_input[,i]+0.01)
}
Expand All @@ -206,17 +205,17 @@ RMT <- function(
}
}
}

#feature selection
v <- matrix_log2_fc[,1]
for(i in 1:nrow(matrix_log2_fc)) {v[i] <- var(matrix_log2_fc[i,])}
# hist(v)
matrix_log2_fc <- cbind(tab,matrix_log2_fc[1:nrow(matrix_log2_fc),],matrix_peak_read[tab,])
write.table(matrix_log2_fc,paste(dir,"all_information.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)

# delete un-necessary files
file.remove(paste(dir,"all_peak.xls",sep = '/'))
file.remove(paste(dir,"all_information.xls",sep = '/'))
file.remove(paste(dir,"all_information.xls",sep = '/'))

# notification
print("***************************")
Expand All @@ -239,20 +238,20 @@ RMT <- function(
tablename=PARAMETERS$UCSC_TABLE_NAME)
options(op)
}

# use provided annotation data file
if (!is.na(PARAMETERS$GENE_ANNO_GTF) & is.na(PARAMETERS$TXDB) ) {
op <- options(warn = (-1))
txdb=makeTxDbFromGFF(PARAMETERS$GENE_ANNO_GTF,format="gtf")
options(op)
}

# use provided annotation data file
if (!is.na(PARAMETERS$TXDB) ) {
txdb=PARAMETERS$TXDB
}


# return data
return(txdb)
}
Expand All @@ -261,7 +260,7 @@ RMT <- function(
a = read.table(filepath,sep="\t",header=TRUE,stringsAsFactors =FALSE)
mcols_info =a[,13:length(a[1,])]
a = a[,1:12]

# get transcripts
no_tx = length(a[,1])
tx_id = 1:no_tx;
Expand All @@ -272,46 +271,46 @@ RMT <- function(
tx_end = a[,3]
transcripts= data.frame(tx_id,tx_name,tx_chrom,tx_strand,tx_start,tx_end)
head(transcripts)

# get splicings
splicing = data.frame()
for (i in 1:no_tx) {
tx = a[i,]
tx_id = i
exon_rank=1:as.integer(tx[10])

# get start
temp = as.integer(strsplit(as.character(tx[12]), ",")[[1]]) + tx_start[i]
exon_start=temp

# get end
temp = as.integer(strsplit(as.character(tx[11]), ",")[[1]])
temp2 = temp + exon_start - 1
exon_end=temp2

# get CDS
cds_start = exon_start
cds_end = exon_end

# get data frame
splicing_tx = data.frame(tx_id,exon_rank,exon_start,exon_end,cds_start,cds_end)

# collect result
splicing = rbind(splicing, splicing_tx)
}

# get genes
tx_name = tx_name
gene_id = as.character(a[,4])
gene_id[is.na(gene_id)]="NA"
gene=data.frame(tx_name,gene_id)
#

#

peaks = makeTxDb(transcripts=transcripts, splicings=splicing,
genes=gene)
tx <- exonsBy(peaks, "tx",use.names=TRUE)
mcols(tx) <- mcols_info

return(tx)
}
3 changes: 2 additions & 1 deletion R/exomepeak.R
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ exomepeak <- function(
for (ibatch in 1:no_batch){
reads_count_group=rbind(reads_count_group,temp[[ibatch]])
}
READS_COUNT=rbind(READS_COUNT,reads_count_group)}
READS_COUNT=rbind(READS_COUNT,reads_count_group)
}
READS_COUNT=data.frame(READS_COUNT)

# peak calling
Expand Down
7 changes: 5 additions & 2 deletions R/peak.calling.module.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@ for (iSample in 1:length(SAMPLE_ID$ip)) {
}

# output peaks
PEAK=list(PW=global.pw, Merged=global.sig.check.point.id, Consistent=consistent.sid)
PEAK = list(PW = global.pw,
Merged = global.sig.check.point.id,
Consistent = consistent.sid)

ID=PEAK$Merged
PEAK$loci2peak_merged=.get.peak.from.loci(READS_COUNT,ID,PARAMETERS)
ID=PEAK$Consistent
PEAK$loci2peak_consistent=.get.peak.from.loci(READS_COUNT,ID,PARAMETERS)

return(PEAK)}
return(PEAK)}
Empty file modified README.Rmd
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
Loading

0 comments on commit f097a89

Please sign in to comment.