fix several bugs

ZW-xjtlu · Nov 29, 2019 · f097a89 · f097a89
1 parent a550d4a
commit f097a89
Show file tree

Hide file tree

Showing 57 changed files with 645 additions and 615 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+*.Rproj
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,13 +5,17 @@ Title: exome-based anlaysis of MeRIP-Seq data: peak calling and
 Version: 2.16.0
 Date: 2017-10-24
 Author: Lin Zhang <lin.zhang@cumt.edu.cn>, Lian Liu <liulian19860905@163.com>, Jia Meng <jia.meng@xjtlu.edu.cn>
-Maintainer: Lin Zhang <lin.zhang@cumt.edu.cn>, Lian Liu <liulian19860905@163.com>, Jia Meng <jia.meng@xjtlu.edu.cn>
+Maintainer: Lin Zhang <lin.zhang@cumt.edu.cn>, Lian Liu <liulian19860905@163.com>, Jia Meng <jia.meng@xjtlu.edu.cn>, Zhen Wei <zhen.wei@xjtlu.edu.cn>
 Description: The package is developed for the analysis of affinity-based epitranscriptome shortgun sequencing data from MeRIP-seq (maA-seq). It was built on the basis of the exomePeak MATLAB package (Meng, Jia, et al. "Exome-based analysis for RNA epigenome sequencing data." Bioinformatics 29.12 (2013): 1565-1567.) with new functions for differential analysis of two experimental conditions to unveil the dynamics in post-transcriptional regulation of the RNA methylome. The exomePeak R-package accepts and statistically supports multiple biological replicates, internally removes PCR artifacts and multi-mapping reads, outputs exome-based binding sites (RNA methylation sites) and detects differential post-transcriptional RNA modification sites between two experimental conditions in term of percentage rather the absolute amount. The package is still under active development, and we welcome all biology and computation scientist for all kinds of collaborations and communications. Please feel free to contact Dr. Jia Meng <jia.meng@hotmail.com> if you have any questions.
 License: GPL-2
 Depends: Rsamtools, GenomicFeatures (>= 1.14.5), rtracklayer,
         GenomicAlignments
-biocViews: Sequencing, HighThroughputSequencing, Methylseq, RNAseq
+Suggests: 
+    knitr,
+    rmarkdown
+biocViews: Sequencing, MethylSeq, RNASeq
 NeedsCompilation: no
+VignetteBuilder: knitr
 Packaged: 2018-10-31 00:00:32 UTC; biocbuild
 git_url: https://git.bioconductor.org/packages/exomePeak
 git_branch: RELEASE_3_8

diff --git a/NAMESPACE b/NAMESPACE
@@ -4,4 +4,6 @@ export(rhtest)
 export(exomepeak)
 export(bltest)
 export(RMT)
-
+import(GenomicFeatures)
+import(Rsamtools)
+import(GenomicAlignments)
diff --git a/NEWS b/NEWS
@@ -22,3 +22,5 @@ version 2.13.0: (2017-10-24)
 - package overview revised 
 - revision due to the changes of a dependent package
 
+version 2.14.0: (2019-11-29)
+- fixed a few bugs, by Zhen Wei
diff --git a/R/RMT.R b/R/RMT.R
@@ -3,9 +3,9 @@
 RMT <- function(
   INPUT_BAM,
   IP_BAM,
-  INPUT2IP = NA, 
-  GENE_ANNO_GTF = NA, 
-  GENOME = NA, 
+  INPUT2IP = NA,
+  GENE_ANNO_GTF = NA,
+  GENOME = NA,
   UCSC_TABLE_NAME = "knownGene",
   TXDB = NA,
   EXOME_OUTPUT_DIR = NA,
@@ -24,36 +24,35 @@ RMT <- function(
   PARAMETERS$UCSC_TABLE_NAME=UCSC_TABLE_NAME
   PARAMETERS$TXDB = TXDB
 
-
   # dependent variables
   if (is.na(PARAMETERS$EXOME_OUTPUT_DIR)) {
     PARAMETERS$EXOME_OUTPUT_DIR <- getwd()
   }
   if (is.na(PARAMETERS$GO_OUTPUT_DIR)) {
     PARAMETERS$GO_OUTPUT_DIR <- getwd()
   }
-  
+
   # algrithm ##################################################
   #peak calling
   input_length <- length(PARAMETERS$INPUT_BAM)
   ip_length <- length(PARAMETERS$IP_BAM)
   all_filepath <- vector()
-  
+
   # decide whether paired
   temp1 <- is.na(PARAMETERS$INPUT2IP)
   flag <- temp1[1]
-  
-  
+
+
   if(flag){
     for(i in 1:ip_length){
       experiment_name = paste(PARAMETERS$GO_EXPERIMENT_NAME,"/temp/IP_rep_",i,sep="")
       all_filepath[i] = PARAMETERS$IP_BAM[i]
       print(paste("Processing IP sample",i))
-      temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF, 
+      temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF,
                         GENOME = GENOME, UCSC_TABLE_NAME = UCSC_TABLE_NAME,
-                        TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]), 
-                        INPUT_BAM = c(PARAMETERS$INPUT_BAM[i]), 
-                        OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR, 
+                        TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]),
+                        INPUT_BAM = c(PARAMETERS$INPUT_BAM[i]),
+                        OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR,
                         POISSON_MEAN_RATIO = 1,
                         EXPERIMENT_NAME = experiment_name)
     }
@@ -65,19 +64,19 @@ RMT <- function(
         for(j in 2:length(PARAMETERS$INPUT2IP[[i]])){
           input_bam=c(input_bam,c(PARAMETERS$INPUT_BAM[PARAMETERS$INPUT2IP[[i]][j]]))
         }
-      }      
+      }
       all_filepath[i] <- paste(PARAMETERS$EXOME_OUTPUT_DIR,experiment_name,sep = '/')
       print(paste("Processing IP sample",i))
-      temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF, 
+      temp <- exomepeak(GENE_ANNO_GTF = GENE_ANNO_GTF,
                         GENOME = GENOME, UCSC_TABLE_NAME = UCSC_TABLE_NAME,
-                        TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]), 
-                        INPUT_BAM = input_bam, 
-                        OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR, 
-                        POISSON_MEAN_RATIO = 1, 
-                        EXPERIMENT_NAME = experiment_name)   
+                        TXDB = TXDB, IP_BAM = c(PARAMETERS$IP_BAM[i]),
+                        INPUT_BAM = input_bam,
+                        OUTPUT_DIR = PARAMETERS$EXOME_OUTPUT_DIR,
+                        POISSON_MEAN_RATIO = 1,
+                        EXPERIMENT_NAME = experiment_name)
     }
   }
-  
+
   #merge all peak
   for(i in 1:length(all_filepath)){
     path <- paste(all_filepath[i],"peak.xls",sep = '/')
@@ -86,7 +85,7 @@ RMT <- function(
       all_peak <- bam_file
     } else {
       all_peak = rbind(all_peak,bam_file[2:nrow(bam_file), ])
-    } 
+    }
   }
   dir.create(paste(PARAMETERS$GO_OUTPUT_DIR,
                    PARAMETERS$GO_EXPERIMENT_NAME,sep = '/'),
@@ -96,28 +95,28 @@ RMT <- function(
   write.table(all_peak,paste(dir,"all_peak.xls",sep = '/'),
               sep = "\t",quote = FALSE,col.names = FALSE,
               row.names = FALSE)
-  
+
   #read all peaks and transform GRangesList
   filepath = paste(dir,"all_peak.xls",sep = '/')
   matrix_peak_read = read.table(filepath,header = TRUE,stringsAsFactors = FALSE)
-  
+
   ori_peak <- .xls2Grangeslist(filepath)
-  
+
   #remove overlaps
   overlaps <- findOverlaps(ori_peak,ori_peak)
   matrix_overlap <- matrix(0,length(overlaps),2)
   matrix_overlap[,1] <- queryHits(overlaps)
   matrix_overlap[,2] <- subjectHits(overlaps)
-  
+
   num <- c(rep(0,length(ori_peak)))
-  
+
   for(i in 1:nrow(matrix_overlap)){
     if(matrix_overlap[i, 1][1][[1]] != matrix_overlap[i, 2][1][[1]]){
       x <- ranges(ori_peak[matrix_overlap[i, 1]])[[1]]@width
       y <- ranges(ori_peak[matrix_overlap[i, 2]])[[1]]@width
       #num[i]=if(x<y) matrix_overlap[i,1][1][[1]] else matrix_overlap[i,2][1][[1]]
       if(x < y){
-        num[matrix_overlap[i, 1][1][[1]]] <- matrix_overlap[i, 1][1][[1]] 
+        num[matrix_overlap[i, 1][1][[1]]] <- matrix_overlap[i, 1][1][[1]]
       } else if(x > y){
         num[matrix_overlap[i, 2][1][[1]]] <- matrix_overlap[i, 2][1][[1]]
       } else {
@@ -130,12 +129,12 @@ RMT <- function(
   tab <- which(num == 0)
   write.table(matrix_peak_read[tab, ],paste(dir,"merged_peak.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
   write.table(matrix_peak_read[tab,1:12 ],paste(dir,"merged_peak.bed",sep = '/'),sep = "\t",quote = FALSE,col.names = FALSE,row.names = FALSE)
-  
-  
+
+
   # read gene annotation from internet
   txdb <- .readTxDb2(PARAMETERS)
   exonRanges <- exonsBy(txdb, "gene")
-  
+
   #rpkm in all inputbams and ipbams
   rpkm_input <- matrix(0,length(peak),length(PARAMETERS$INPUT_BAM))
   rpkm_ip <- matrix(0,length(peak),length(PARAMETERS$IP_BAM))
@@ -147,7 +146,7 @@ RMT <- function(
     aligns <- readGAlignments(filepath)
     para <- ScanBamParam(what="mapq")
     mapq <- scanBam(filepath, param=para)[[1]][[1]]
-    # filter reads with mapq smaller than 30. 
+    # filter reads with mapq smaller than 30.
     mapq[is.na(mapq)] <- 255  # Note: mapq "NA" means mapq = 255
     ID_keep <- (mapq >30)
     filtered <- aligns[ID_keep]
@@ -160,7 +159,7 @@ RMT <- function(
     millionsMapped <- length(transcriptome_filtered_aligns) / 1000000
     rpm <- counts / millionsMapped
     rpkm_input[,i] <- rpm / geneLengthsInKB
-    
+
   }
   write.table(count_input,paste(dir,"count_input.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
   write.table(rpkm_input,paste(dir,"rpkm_input.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
@@ -170,7 +169,7 @@ RMT <- function(
     aligns <- readGAlignments(filepath)
     para <- ScanBamParam(what="mapq")
     mapq <- scanBam(filepath, param=para)[[1]][[1]]
-    # filter reads with mapq smaller than 30. 
+    # filter reads with mapq smaller than 30.
     mapq[is.na(mapq)] <- 255  # Note: mapq "NA" means mapq = 255
     ID_keep <- (mapq >30)
     filtered <- aligns[ID_keep]
@@ -182,14 +181,14 @@ RMT <- function(
     geneLengthsInKB <- numBases / 1000
     millionsMapped <- length(transcriptome_filtered_aligns) / 1000000
     rpm <- counts / millionsMapped
-    rpkm_ip[,i] <- rpm / geneLengthsInKB 
+    rpkm_ip[,i] <- rpm / geneLengthsInKB
   }
   write.table(count_ip,paste(dir,"count_ip.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
   write.table(rpkm_ip,paste(dir,"rpkm_ip.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
 
   #return to all conditions
   matrix_log2_fc = matrix(0,length(peak),ip_length)
-  if(is.na(PARAMETERS$INPUT2IP)){
+  if(any(is.na(PARAMETERS$INPUT2IP))){
     for(i in 1:ip_length){
       matrix_log2_fc[,i] <- log2(rpkm_ip[,i]+0.01)-log2(rpkm_input[,i]+0.01)
     }
@@ -206,17 +205,17 @@ RMT <- function(
       }
     }
   }
-  
+
   #feature selection
   v <- matrix_log2_fc[,1]
   for(i in 1:nrow(matrix_log2_fc)) {v[i] <- var(matrix_log2_fc[i,])}
   # hist(v)
   matrix_log2_fc <- cbind(tab,matrix_log2_fc[1:nrow(matrix_log2_fc),],matrix_peak_read[tab,])
   write.table(matrix_log2_fc,paste(dir,"all_information.xls",sep = '/'),sep = "\t",quote = FALSE,col.names = TRUE,row.names = FALSE)
-  
+
   # delete un-necessary files
   file.remove(paste(dir,"all_peak.xls",sep = '/'))
-  file.remove(paste(dir,"all_information.xls",sep = '/')) 
+  file.remove(paste(dir,"all_information.xls",sep = '/'))
 
   # notification
   print("***************************")
@@ -239,20 +238,20 @@ RMT <- function(
                            tablename=PARAMETERS$UCSC_TABLE_NAME)
     options(op)
   }
-  
+
   # use provided annotation data file
   if (!is.na(PARAMETERS$GENE_ANNO_GTF) & is.na(PARAMETERS$TXDB) ) {
     op <- options(warn = (-1))
     txdb=makeTxDbFromGFF(PARAMETERS$GENE_ANNO_GTF,format="gtf")
     options(op)
   }
-  
+
   # use provided annotation data file
   if (!is.na(PARAMETERS$TXDB) ) {
     txdb=PARAMETERS$TXDB
   }
-  
-  
+
+
   # return data
   return(txdb)
 }
@@ -261,7 +260,7 @@ RMT <- function(
     a = read.table(filepath,sep="\t",header=TRUE,stringsAsFactors =FALSE)
     mcols_info =a[,13:length(a[1,])]
     a = a[,1:12]
-    
+
     # get transcripts
     no_tx = length(a[,1])
     tx_id = 1:no_tx;
@@ -272,46 +271,46 @@ RMT <- function(
     tx_end = a[,3]
     transcripts= data.frame(tx_id,tx_name,tx_chrom,tx_strand,tx_start,tx_end)
     head(transcripts)
-    
+
     # get splicings
     splicing = data.frame()
     for (i in 1:no_tx) {
       tx = a[i,]
       tx_id = i
       exon_rank=1:as.integer(tx[10])
-      
+
       # get start
       temp = as.integer(strsplit(as.character(tx[12]), ",")[[1]]) + tx_start[i]
       exon_start=temp
-      
+
       # get end
       temp = as.integer(strsplit(as.character(tx[11]), ",")[[1]])
       temp2 = temp + exon_start - 1
       exon_end=temp2
-      
+
       # get CDS
       cds_start = exon_start
       cds_end = exon_end
-      
+
       # get data frame
       splicing_tx = data.frame(tx_id,exon_rank,exon_start,exon_end,cds_start,cds_end)
-      
+
       # collect result
       splicing = rbind(splicing, splicing_tx)
     }
-    
+
     # get genes
     tx_name = tx_name
     gene_id = as.character(a[,4])
     gene_id[is.na(gene_id)]="NA"
     gene=data.frame(tx_name,gene_id)
-    
-    # 
-    
+
+    #
+
     peaks = makeTxDb(transcripts=transcripts, splicings=splicing,
                              genes=gene)
     tx <- exonsBy(peaks, "tx",use.names=TRUE)
     mcols(tx) <- mcols_info
-    
+
     return(tx)
   }
diff --git a/R/exomepeak.R b/R/exomepeak.R
@@ -125,7 +125,8 @@ exomepeak <- function(
     for (ibatch in 1:no_batch){
       reads_count_group=rbind(reads_count_group,temp[[ibatch]])
     }
-    READS_COUNT=rbind(READS_COUNT,reads_count_group)}
+    READS_COUNT=rbind(READS_COUNT,reads_count_group)
+    }
   READS_COUNT=data.frame(READS_COUNT)
 
   # peak calling

diff --git a/R/peak.calling.module.R b/R/peak.calling.module.R
@@ -23,10 +23,13 @@ for (iSample in 1:length(SAMPLE_ID$ip)) {
 }
 
 # output peaks
-PEAK=list(PW=global.pw, Merged=global.sig.check.point.id, Consistent=consistent.sid)
+PEAK = list(PW = global.pw,
+            Merged = global.sig.check.point.id,
+            Consistent = consistent.sid)
+
 ID=PEAK$Merged
 PEAK$loci2peak_merged=.get.peak.from.loci(READS_COUNT,ID,PARAMETERS)
 ID=PEAK$Consistent
 PEAK$loci2peak_consistent=.get.peak.from.loci(READS_COUNT,ID,PARAMETERS)
 
-return(PEAK)}
+return(PEAK)}
diff --git a/README.Rmd b/README.Rmd
diff --git a/README.md b/README.md