add vignette, biocCheck, cleanup code

PoisonAlien · Apr 2, 2016 · 3094327 · 3094327
1 parent 360aaf4
commit 3094327
Show file tree

Hide file tree

Showing 59 changed files with 1,253 additions and 281 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,4 @@
-.Rproj.user
+proj.user
 .Rhistory
 .RData
-inst/doc
+.Rproj.user
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,27 +1,31 @@
 Package: maftools
 Type: Package
 Title: maftools : Summarize, Analyze and Visualize MAF files.
-Version: 0.1
+Version: 0.99.0
 Date: 2015-12-14
-Author: Anand M <<anand_mt@hotmail.com>>
-Maintainer: Anand M <anand_mt@hotmail.com>
+Author: Anand Mayakonda <<anand_mt@hotmail.com>>
+Maintainer: Anand Mayakonda <anand_mt@hotmail.com>
 Description: Process MAF files.
 URL: https://github.com/PoisonAlien/maftools
 License: MIT
 LazyData: TRUE
-Depends: R (>= 3.1.2), data.table, ggplot2
+Depends: R (>= 3.2), data.table, ggplot2
 Imports:
-    plyr,
-    reshape,
     cowplot,
     cometExactTest,
     RColorBrewer,
     dplyr,
     gplots,
     NMF,
     ggrepel,
-    methods
+    methods,
+    ComplexHeatmap,
+    mclust,
+    VariantAnnotation,
+    Biostrings
 RoxygenNote: 5.0.1
 Suggests: knitr,
-    rmarkdown
+    rmarkdown,
+    testthat
 VignetteBuilder: knitr
+biocViews: DataRepresentation, DNASeq, Visualization, FeatureExtraction, Classification, SomaticMutation, Sequencing
diff --git a/NAMESPACE b/NAMESPACE
@@ -10,7 +10,7 @@ export(math.score)
 export(mutExclusive)
 export(oncodrive)
 export(oncoplot)
-export(oncoprint)
+export(oncostrip)
 export(oncotate)
 export(pfamDomains)
 export(plotOncodrive)
@@ -25,4 +25,17 @@ export(write.mafSummary)
 exportClasses(MAF)
 exportMethods(getGeneSummary)
 exportMethods(getSampleSummary)
+import(ComplexHeatmap)
+import(NMF)
+import(RColorBrewer)
+import(cometExactTest)
+import(cowplot)
+import(data.table)
+import(ggrepel)
+import(gplots)
+import(mclust)
 import(methods)
+importFrom(Biostrings,subseq)
+importFrom(VariantAnnotation,getSeq)
+importFrom(VariantAnnotation,seqlevels)
+importFrom(dplyr,filter)
diff --git a/R/TrinucleotideMatrix.R b/R/TrinucleotideMatrix.R
@@ -9,12 +9,16 @@
 #' @param ignoreChr Chromsomes to remove from analysis. e.g. chrM
 #' @param useSyn Logical. Whether to include synonymous variants in analysis. Defaults to FALSE.
 #' @return A matrix of dimension nx96, where n is the number of samples in the MAF.
+#' @examples
+#' laml.tnm <- trinucleotideMatrix(maf = laml, ref_genome = 'hg19.fa', prefix = 'chr', add = T, useSyn = T)
+#' @importFrom VariantAnnotation getSeq seqlevels
+#' @importFrom Biostrings subseq
 #' @export
 
-trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignoreChr = NULL, useSyn = F){
+trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignoreChr = NULL, useSyn = FALSE){
 
-  suppressPackageStartupMessages(require('VariantAnnotation', quietly = T))
-  suppressPackageStartupMessages(require('Biostrings', quietly = T))
+  #suppressPackageStartupMessages(require('VariantAnnotation', quietly = TRUE))
+  #suppressPackageStartupMessages(require('Biostrings', quietly = TRUE))
 
   #Synonymous variants
   maf.silent = maf@maf.silent
@@ -26,20 +30,20 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor
   maf = maf[!Variant_Classification %in% silent] #Remove silent variants from main table
 
   if(useSyn){
-    maf = rbind(maf, maf.silent, fill = T)
+    maf = rbind(maf, maf.silent, fill = TRUE)
   }
 
   #one bp up and down.
   up = down = 1
 
   #reate a reference to an indexed fasta file.
-  ref = FaFile(file = ref_genome)
+  ref = Rsamtools::FaFile(file = ref_genome)
 
   if(!is.null(prefix)){
     if(add){
       maf$Chromosome = paste(prefix,maf$Chromosome, sep = '')
     }else{
-      maf$Chromosome = gsub(pattern = prefix, replacement = '', x = maf$Chromosome, fixed = T)
+      maf$Chromosome = gsub(pattern = prefix, replacement = '', x = maf$Chromosome, fixed = TRUE)
     }
   }
 
@@ -66,10 +70,10 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor
 
   #read fasta
   message('reading fasta (this might take a while)..')
-  ref = getSeq(x = ref)
+  ref = VariantAnnotation::getSeq(x = ref)
 
   #extract contigs from reference fasta
-  seq.lvl = seqlevels(ref)
+  seq.lvl = VariantAnnotation::seqlevels(ref)
 
   chrs.missing = chrs[!chrs %in% seq.lvl]
 
@@ -89,7 +93,7 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor
                            Reference_Allele = maf.snp$Reference_Allele, Tumor_Seq_Allele2 = maf.snp$Tumor_Seq_Allele2, Tumor_Sample_Barcode = maf.snp$Tumor_Sample_Barcode)
 
   message('Extracting adjacent bases..')
-  ss = subseq(x = ref[extract.tbl[,Chromosome]], start = extract.tbl[,Start] , end = extract.tbl[,End])
+  ss = Biostrings::subseq(x = ref[extract.tbl[,Chromosome]], start = extract.tbl[,Start] , end = extract.tbl[,End])
   extract.tbl[,trinucleotide:= as.character(ss)]
   extract.tbl[,Substitution:=paste(extract.tbl$Reference_Allele, extract.tbl$Tumor_Seq_Allele2, sep='>')]
 
@@ -121,7 +125,7 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor
 
   #colOrderClasses = rep(c('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G'), each = 16)
 
-  conv.mat = as.data.frame(cast(extract.tbl.summary, formula = Tumor_Sample_Barcode~SomaticMutationType, fill = 0, value = 'N'))
+  conv.mat = as.data.frame(data.table::dcast(extract.tbl.summary, formula = Tumor_Sample_Barcode~SomaticMutationType, fill = 0, value.var = 'N'))
   #head(conv.mat)
   rownames(conv.mat) = conv.mat[,1]
   #head(conv.mat)

diff --git a/R/addBases.R b/R/addBases.R
@@ -9,12 +9,15 @@
 #' @param n decompose matrix into n signatures. Default NULL. Tries to predict best value for \code{n} by running NMF on a range of values and chooses the one with highest cophenetic correlation coefficient.
 #' @param nTry tries upto this number of signatures before choosing best \code{n}. Default 6.
 #' @return returns a list with decomposed signatures and correlatoon table against validated signatures.
+#' @examples
+#' laml.sign <- extractSignatures(mat = laml.tnm)
+#' @import NMF
 #' @export
 
 
 extractSignatures = function(mat, n = NULL, nTry = 6){
 
-  require(NMF, quietly = T)
+  suppressPackageStartupMessages(require(NMF, quietly = T))
   #transpose matrix
   mat = t(mat)
 
@@ -35,7 +38,7 @@ extractSignatures = function(mat, n = NULL, nTry = 6){
   if(is.null(n)){
     message('Estimating best rank..')
     nmfTry = NMF::nmfEstimateRank(mat, seq(2,nTry), method='brunet', nrun=10, seed=123456) #try nmf for a range of values
-    print(plot(nmfTry, 'cophenetic' ))
+    #print(plot(nmfTry, 'cophenetic' ))
     nmf.sum = summary(nmfTry) # Get summary of estimates
     print(nmf.sum)
     bestFit = nmf.sum[which(nmf.sum$cophenetic == max(nmf.sum$cophenetic)),'rank'] #Get the best rank based on highest cophenetic correlation coefficient
@@ -52,7 +55,7 @@ extractSignatures = function(mat, n = NULL, nTry = 6){
   #conv.mat.nmf.signatures.melted = melt(conv.mat.nmf.signatures)
   #levels(conv.mat.nmf.signatures.melted$X1) = colOrder
 
-  sigs = fread(input = system.file('extdata', 'signatures.txt', package = 'maftools'), stringsAsFactors = F, data.table = F)
+  sigs = fread(input = system.file('extdata', 'signatures.txt', package = 'maftools'), stringsAsFactors = FALSE, data.table = FALSE)
   colnames(sigs) = gsub(pattern = ' ', replacement = '_', x = colnames(sigs))
   rownames(sigs) = sigs$Somatic_Mutation_Type
   sigs = sigs[,-c(1:3)]

diff --git a/R/addReadCounts.R b/R/addReadCounts.R
@@ -26,25 +26,25 @@ addReadCounts = function(maf, bam_df, BaseQuality = 10, MapQuality = 10, ref_gen
 
   if(is.null(ref_genome)){
     message("ref_genome required !")
-    stop(call. = T)
+    stop(call. = TRUE)
   }
 
   maf.dat = maf@data
-  
+
   #Some TCGA studies have Start_Position set to as 'position'. Change if so.
   if(length(grep(pattern = 'Start_position', x = colnames(maf.dat))) > 0){
     colnames(maf.dat)[which(colnames(maf.dat) == 'Start_position')] = 'Start_Position'
   }
-  
+
   if(length(grep(pattern = 'End_position', x = colnames(maf.dat))) > 0){
     colnames(maf.dat)[which(colnames(maf.dat) == 'End_position')] = 'End_Position'
   }
-  
+
   if(!is.null(prefix)){
     if(add){
       maf.dat$Chromosome = paste(prefix,maf.dat$Chromosome, sep = '')
     }else{
-      maf.dat$Chromosome = gsub(pattern = prefix, replacement = '', x = maf.dat$Chromosome, fixed = T)
+      maf.dat$Chromosome = gsub(pattern = prefix, replacement = '', x = maf.dat$Chromosome, fixed = TRUE)
     }
   }
 
@@ -91,10 +91,10 @@ addReadCounts = function(maf, bam_df, BaseQuality = 10, MapQuality = 10, ref_gen
       if(nrow(maf.snp) > 0){
 
         var = maf.snp[,c('Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2')]
-        write.table(var, 'var_temp.txt', sep = '\t',quote = F, row.names = F, col.names = F)
+        write.table(var, 'var_temp.txt', sep = '\t',quote = FALSE, row.names = FALSE, col.names = FALSE)
         cat("Processing ", tsb, " [", nrow(var),  " SNPs] ..\n", sep = '')
         counts = getCounts(var = 'var_temp.txt', bam = bam_path, MapQuality = MapQuality, BaseQuality = BaseQuality, ref_genome = ref_genome)
-        maf.snp = merge(maf.snp, counts[,3:6], by = 'row.names', all.x = T)
+        maf.snp = merge(maf.snp, counts[,3:6], by = 'row.names', all.x = TRUE)
         maf.snp = maf.snp[,-1]
 
         if(nrow(maf.rest) > 0 ){

diff --git a/R/annovarToMaf.R b/R/annovarToMaf.R
@@ -20,19 +20,22 @@
 #' @param MAFobj If TRUE, returns results as an MAF object.
 #' @references Wang, K., Li, M. & Hakonarson, H. ANNOVAR: functional annotation of genetic variants from high-throughput sequencing data. Nucleic Acids Res 38, e164 (2010).
 #' @return MAF table.
+#' @examples
+#' var.annovar <- system.file("extdata", "variants.hg19_multianno.txt", package = "maftools")
+#' var.annovar.maf <- annovarToMaf(annovar = var.annovar, Center = 'CSI-NUS', refBuild = 'hg19', tsbCol = 'Tumor_Sample_Barcode', table = 'ensGene', header = T)
 #' @export
 
-annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL, table = 'refGene', basename = NULL, header = F, sep = '\t', MAFobj = F){
+annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL, table = 'refGene', basename = NULL, header = FALSE, sep = '\t', MAFobj = FALSE){
 
   if(header){
-    ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = F, skip = 1, header = T)
+    ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = FALSE, skip = 1, header = TRUE)
     if(table == 'ensGene'){
       colnames(ann)[1:10] = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.ensGene', 'Gene.ensGene', 'GeneDetail.ensGene', 'ExonicFunc.ensGene', 'AAChange.ensGene')
     }else{
       colnames(ann)[1:10] = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene')
     }
   }else{
-    ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = F, skip =1, header = F)
+    ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = FALSE, skip =1, header = FALSE)
     if(table == 'ensGene'){
       colnames(ann)[1:10] = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.ensGene', 'Gene.ensGene', 'GeneDetail.ensGene', 'ExonicFunc.ensGene', 'AAChange.ensGene')
     }else{
@@ -43,7 +46,7 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
 
   #Check to see if input file contains sample names
   if(is.null(tsbCol)){
-    if(! 'Tumor_Sample_Barcode' %in% colnames(annovar)){
+    if(! 'Tumor_Sample_Barcode' %in% colnames(ann)){
       message('Available fields:')
       print(ann)
       stop('Tumor_Sample_Barcode field not found in input file. Use argument tsbCol to manually specifiy field name containing sample names/Tumor_Sample_Barcodes.')
@@ -83,12 +86,20 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
   #Rest of the optional fields (later they will be attached to the maf file)
   ann.opt = colnames(ann)[!colnames(ann) %in% ann.mand]
   ann.opt = c(ann.opt, 'uid')
-  ann.opt = ann[,ann.opt,with = F]
+  ann.opt = ann[,ann.opt,with = FALSE]
 
-  ann = ann[,ann.mand,with = F]
+  ann = ann[,ann.mand,with = FALSE]
 
   ann$ExonicFunc.refGene = gsub(pattern = ' SNV', replacement = '', x = ann$ExonicFunc.refGene)
 
+  funcSpl = strsplit(x = as.character(ann$ExonicFunc.refGene), split = ';', fixed = TRUE)
+  funcSpl = sapply(funcSpl, function(l){l[length(l)]})
+  ann$ExonicFunc.refGene = funcSpl
+
+  funcRef = strsplit(x = as.character(ann$Func.refGene), split = ';', fixed = TRUE)
+  funcRef = sapply(funcRef, function(l){l[length(l)]})
+  ann$Func.refGene = funcRef
+
   #Change Variant Classification factors.
   ann$ExonicFunc.refGene = ifelse(test = ann$Func.refGene == 'intronic', yes = 'Intron', no = ann$ExonicFunc.refGene)
   ann$ExonicFunc.refGene = ifelse(test = ann$Func.refGene == 'intergenic', yes = 'IGR', no = ann$ExonicFunc.refGene)
@@ -127,19 +138,19 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
     ann$var.type = 'SNP'
   }
 
-  ann = rbind(ann, ann.del, ann.ins, fill = T)
+  ann = rbind(ann, ann.del, ann.ins, fill = TRUE)
 
   ann.splice = ann[ExonicFunc.refGene == 'Splice_Site']
   if(nrow(ann.splice) > 0){
     ann = ann[ExonicFunc.refGene != 'Splice_Site']
     #ann.splice$AAChange.refGene = gsub(x = sapply(strsplit(x = as.character(ann.splice$Gene.refGene), split = '(', fixed = T), '[[',2), pattern = ')$', replacement = '')
-    ann.splice$Gene.refGene = sapply(strsplit(x = as.character(ann.splice$Gene.refGene), split = '(', fixed = T), '[[',1)
-    ann = rbind(ann, ann.splice, fill = T)
+    ann.splice$Gene.refGene = sapply(strsplit(x = as.character(ann.splice$Gene.refGene), split = '(', fixed = TRUE), '[[',1)
+    ann = rbind(ann, ann.splice, fill = TRUE)
   }
 
   #protein and tx changes
   #NOTE: for now last transcript is considered from the total annotaion.
-  xaa = strsplit(as.character(ann$AAChange.refGene),split = ':',fixed = T)
+  xaa = strsplit(as.character(ann$AAChange.refGene),split = ':',fixed = TRUE)
   proteinChange = sapply(xaa, function(l){l[length(l)]})
   tx = unlist(sapply(xaa, function(l){l[length(l)-3]}))
   txChange = unlist(sapply(xaa, function(l){if(length(l) > 1){l[length(l)-1]} else{NA}}))
@@ -159,21 +170,21 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
     message('Converting Ensemble Gene IDs into HGNC gene symbols.')
     if(Sys.info()[['sysname']] == 'Windows'){
       ens.gz = gzfile(description = ens, open = 'r')
-      ens <- suppressWarnings( data.table(read.csv( file = gff.gz, header = T, sep = '\t', stringsAsFactors = F)) )
+      ens <- suppressWarnings( data.table(read.csv( file = gff.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) )
       close(ens.gz)
     } else{
-      ens = fread(input = paste('zcat <', ens), sep = '\t', stringsAsFactors = F)
+      ens = fread(input = paste('zcat <', ens), sep = '\t', stringsAsFactors = FALSE)
     }
 
-    ann.maf = merge(ann.maf, ens, by.x = 'Hugo_Symbol', by.y = 'ens_id', all.x = T)
+    ann.maf = merge(ann.maf, ens, by.x = 'Hugo_Symbol', by.y = 'ens_id', all.x = TRUE)
     ann.maf[,ens_id := Hugo_Symbol] #Backup original ids
     ann.maf[,Hugo_Symbol := hgnc_symbol] #Add GHNC gene names
     ann.maf[,Entrez_Gene_Id := Entrez] #Add entrez identifiers.
     message('Done! Original ensemble gene IDs are preserved under field name ens_id')
   }
 
   if(!is.null(basename)){
-    write.table(x = ann.maf, file = paste(basename, 'maf', sep='.'), sep='\t', quote = F, row.names = F)
+    write.table(x = ann.maf, file = paste(basename, 'maf', sep='.'), sep='\t', quote = FALSE, row.names = FALSE)
   }
 
   if(MAFobj){
@@ -182,15 +193,20 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
       message('Too few samples to create MAF object. Returning MAF table.')
       return(ann.maf)
     }else{
-      ann.maf.onomat = createOncoMatrix(maf = ann.maf)
+      #Convert to factors.
+      ann.maf$Tumor_Sample_Barcode = as.factor(x = ann.maf$Hugo_Symbol)
+      ann.maf$Variant_Classification = as.factor(x = ann.maf$Variant_Classification)
+      ann.maf$Variant_Type = as.factor(x = ann.maf$Variant_Type)
+
+      ann.maf.oncomat = createOncoMatrix(maf = ann.maf)
 
       silent = c("Silent", "Intron", "RNA", "3'UTR", "3'Flank", "5'UTR", "5'Flank", "IGR")
       ann.maf.silent = ann.maf[Variant_Classification %in% silent]
 
       m = MAF(data = ann.maf, variants.per.sample = ann.maf.summary$variants.per.sample, variant.type.summary = ann.maf.summary$variant.type.summary,
               variant.classification.summary = ann.maf.summary$variant.classification.summary, gene.summary = ann.maf.summary$gene.summary,
-              oncoMatrix = ann.maf.onomat$oncomat, numericMatrix = ann.maf.onomat$nummat, summary = ann.maf.summary$summary,
-              classCode = ann.maf.onomat$vc, maf.silent = ann.maf.silent)
+              oncoMatrix = ann.maf.oncomat$oncomat, numericMatrix = ann.maf.oncomat$nummat, summary = ann.maf.summary$summary,
+              classCode = ann.maf.oncomat$vc, maf.silent = ann.maf.silent)
 
       return(m)
     }