Skip to content

Commit

Permalink
add vignette, biocCheck, cleanup code
Browse files Browse the repository at this point in the history
  • Loading branch information
PoisonAlien committed Apr 2, 2016
1 parent 360aaf4 commit 3094327
Show file tree
Hide file tree
Showing 59 changed files with 1,253 additions and 281 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.Rproj.user
proj.user
.Rhistory
.RData
inst/doc
.Rproj.user
20 changes: 12 additions & 8 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,27 +1,31 @@
Package: maftools
Type: Package
Title: maftools : Summarize, Analyze and Visualize MAF files.
Version: 0.1
Version: 0.99.0
Date: 2015-12-14
Author: Anand M <<anand_mt@hotmail.com>>
Maintainer: Anand M <anand_mt@hotmail.com>
Author: Anand Mayakonda <<anand_mt@hotmail.com>>
Maintainer: Anand Mayakonda <anand_mt@hotmail.com>
Description: Process MAF files.
URL: https://github.com/PoisonAlien/maftools
License: MIT
LazyData: TRUE
Depends: R (>= 3.1.2), data.table, ggplot2
Depends: R (>= 3.2), data.table, ggplot2
Imports:
plyr,
reshape,
cowplot,
cometExactTest,
RColorBrewer,
dplyr,
gplots,
NMF,
ggrepel,
methods
methods,
ComplexHeatmap,
mclust,
VariantAnnotation,
Biostrings
RoxygenNote: 5.0.1
Suggests: knitr,
rmarkdown
rmarkdown,
testthat
VignetteBuilder: knitr
biocViews: DataRepresentation, DNASeq, Visualization, FeatureExtraction, Classification, SomaticMutation, Sequencing
15 changes: 14 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export(math.score)
export(mutExclusive)
export(oncodrive)
export(oncoplot)
export(oncoprint)
export(oncostrip)
export(oncotate)
export(pfamDomains)
export(plotOncodrive)
Expand All @@ -25,4 +25,17 @@ export(write.mafSummary)
exportClasses(MAF)
exportMethods(getGeneSummary)
exportMethods(getSampleSummary)
import(ComplexHeatmap)
import(NMF)
import(RColorBrewer)
import(cometExactTest)
import(cowplot)
import(data.table)
import(ggrepel)
import(gplots)
import(mclust)
import(methods)
importFrom(Biostrings,subseq)
importFrom(VariantAnnotation,getSeq)
importFrom(VariantAnnotation,seqlevels)
importFrom(dplyr,filter)
24 changes: 14 additions & 10 deletions R/TrinucleotideMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@
#' @param ignoreChr Chromsomes to remove from analysis. e.g. chrM
#' @param useSyn Logical. Whether to include synonymous variants in analysis. Defaults to FALSE.
#' @return A matrix of dimension nx96, where n is the number of samples in the MAF.
#' @examples
#' laml.tnm <- trinucleotideMatrix(maf = laml, ref_genome = 'hg19.fa', prefix = 'chr', add = T, useSyn = T)
#' @importFrom VariantAnnotation getSeq seqlevels
#' @importFrom Biostrings subseq
#' @export

trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignoreChr = NULL, useSyn = F){
trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignoreChr = NULL, useSyn = FALSE){

suppressPackageStartupMessages(require('VariantAnnotation', quietly = T))
suppressPackageStartupMessages(require('Biostrings', quietly = T))
#suppressPackageStartupMessages(require('VariantAnnotation', quietly = TRUE))
#suppressPackageStartupMessages(require('Biostrings', quietly = TRUE))

#Synonymous variants
maf.silent = maf@maf.silent
Expand All @@ -26,20 +30,20 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor
maf = maf[!Variant_Classification %in% silent] #Remove silent variants from main table

if(useSyn){
maf = rbind(maf, maf.silent, fill = T)
maf = rbind(maf, maf.silent, fill = TRUE)
}

#one bp up and down.
up = down = 1

#reate a reference to an indexed fasta file.
ref = FaFile(file = ref_genome)
ref = Rsamtools::FaFile(file = ref_genome)

if(!is.null(prefix)){
if(add){
maf$Chromosome = paste(prefix,maf$Chromosome, sep = '')
}else{
maf$Chromosome = gsub(pattern = prefix, replacement = '', x = maf$Chromosome, fixed = T)
maf$Chromosome = gsub(pattern = prefix, replacement = '', x = maf$Chromosome, fixed = TRUE)
}
}

Expand All @@ -66,10 +70,10 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor

#read fasta
message('reading fasta (this might take a while)..')
ref = getSeq(x = ref)
ref = VariantAnnotation::getSeq(x = ref)

#extract contigs from reference fasta
seq.lvl = seqlevels(ref)
seq.lvl = VariantAnnotation::seqlevels(ref)

chrs.missing = chrs[!chrs %in% seq.lvl]

Expand All @@ -89,7 +93,7 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor
Reference_Allele = maf.snp$Reference_Allele, Tumor_Seq_Allele2 = maf.snp$Tumor_Seq_Allele2, Tumor_Sample_Barcode = maf.snp$Tumor_Sample_Barcode)

message('Extracting adjacent bases..')
ss = subseq(x = ref[extract.tbl[,Chromosome]], start = extract.tbl[,Start] , end = extract.tbl[,End])
ss = Biostrings::subseq(x = ref[extract.tbl[,Chromosome]], start = extract.tbl[,Start] , end = extract.tbl[,End])
extract.tbl[,trinucleotide:= as.character(ss)]
extract.tbl[,Substitution:=paste(extract.tbl$Reference_Allele, extract.tbl$Tumor_Seq_Allele2, sep='>')]

Expand Down Expand Up @@ -121,7 +125,7 @@ trinucleotideMatrix = function(maf, ref_genome, prefix = NULL, add = TRUE, ignor

#colOrderClasses = rep(c('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G'), each = 16)

conv.mat = as.data.frame(cast(extract.tbl.summary, formula = Tumor_Sample_Barcode~SomaticMutationType, fill = 0, value = 'N'))
conv.mat = as.data.frame(data.table::dcast(extract.tbl.summary, formula = Tumor_Sample_Barcode~SomaticMutationType, fill = 0, value.var = 'N'))
#head(conv.mat)
rownames(conv.mat) = conv.mat[,1]
#head(conv.mat)
Expand Down
9 changes: 6 additions & 3 deletions R/addBases.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
#' @param n decompose matrix into n signatures. Default NULL. Tries to predict best value for \code{n} by running NMF on a range of values and chooses the one with highest cophenetic correlation coefficient.
#' @param nTry tries upto this number of signatures before choosing best \code{n}. Default 6.
#' @return returns a list with decomposed signatures and correlatoon table against validated signatures.
#' @examples
#' laml.sign <- extractSignatures(mat = laml.tnm)
#' @import NMF
#' @export


extractSignatures = function(mat, n = NULL, nTry = 6){

require(NMF, quietly = T)
suppressPackageStartupMessages(require(NMF, quietly = T))
#transpose matrix
mat = t(mat)

Expand All @@ -35,7 +38,7 @@ extractSignatures = function(mat, n = NULL, nTry = 6){
if(is.null(n)){
message('Estimating best rank..')
nmfTry = NMF::nmfEstimateRank(mat, seq(2,nTry), method='brunet', nrun=10, seed=123456) #try nmf for a range of values
print(plot(nmfTry, 'cophenetic' ))
#print(plot(nmfTry, 'cophenetic' ))
nmf.sum = summary(nmfTry) # Get summary of estimates
print(nmf.sum)
bestFit = nmf.sum[which(nmf.sum$cophenetic == max(nmf.sum$cophenetic)),'rank'] #Get the best rank based on highest cophenetic correlation coefficient
Expand All @@ -52,7 +55,7 @@ extractSignatures = function(mat, n = NULL, nTry = 6){
#conv.mat.nmf.signatures.melted = melt(conv.mat.nmf.signatures)
#levels(conv.mat.nmf.signatures.melted$X1) = colOrder

sigs = fread(input = system.file('extdata', 'signatures.txt', package = 'maftools'), stringsAsFactors = F, data.table = F)
sigs = fread(input = system.file('extdata', 'signatures.txt', package = 'maftools'), stringsAsFactors = FALSE, data.table = FALSE)
colnames(sigs) = gsub(pattern = ' ', replacement = '_', x = colnames(sigs))
rownames(sigs) = sigs$Somatic_Mutation_Type
sigs = sigs[,-c(1:3)]
Expand Down
14 changes: 7 additions & 7 deletions R/addReadCounts.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,25 @@ addReadCounts = function(maf, bam_df, BaseQuality = 10, MapQuality = 10, ref_gen

if(is.null(ref_genome)){
message("ref_genome required !")
stop(call. = T)
stop(call. = TRUE)
}

maf.dat = maf@data

#Some TCGA studies have Start_Position set to as 'position'. Change if so.
if(length(grep(pattern = 'Start_position', x = colnames(maf.dat))) > 0){
colnames(maf.dat)[which(colnames(maf.dat) == 'Start_position')] = 'Start_Position'
}

if(length(grep(pattern = 'End_position', x = colnames(maf.dat))) > 0){
colnames(maf.dat)[which(colnames(maf.dat) == 'End_position')] = 'End_Position'
}

if(!is.null(prefix)){
if(add){
maf.dat$Chromosome = paste(prefix,maf.dat$Chromosome, sep = '')
}else{
maf.dat$Chromosome = gsub(pattern = prefix, replacement = '', x = maf.dat$Chromosome, fixed = T)
maf.dat$Chromosome = gsub(pattern = prefix, replacement = '', x = maf.dat$Chromosome, fixed = TRUE)
}
}

Expand Down Expand Up @@ -91,10 +91,10 @@ addReadCounts = function(maf, bam_df, BaseQuality = 10, MapQuality = 10, ref_gen
if(nrow(maf.snp) > 0){

var = maf.snp[,c('Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2')]
write.table(var, 'var_temp.txt', sep = '\t',quote = F, row.names = F, col.names = F)
write.table(var, 'var_temp.txt', sep = '\t',quote = FALSE, row.names = FALSE, col.names = FALSE)
cat("Processing ", tsb, " [", nrow(var), " SNPs] ..\n", sep = '')
counts = getCounts(var = 'var_temp.txt', bam = bam_path, MapQuality = MapQuality, BaseQuality = BaseQuality, ref_genome = ref_genome)
maf.snp = merge(maf.snp, counts[,3:6], by = 'row.names', all.x = T)
maf.snp = merge(maf.snp, counts[,3:6], by = 'row.names', all.x = TRUE)
maf.snp = maf.snp[,-1]

if(nrow(maf.rest) > 0 ){
Expand Down
50 changes: 33 additions & 17 deletions R/annovarToMaf.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,22 @@
#' @param MAFobj If TRUE, returns results as an MAF object.
#' @references Wang, K., Li, M. & Hakonarson, H. ANNOVAR: functional annotation of genetic variants from high-throughput sequencing data. Nucleic Acids Res 38, e164 (2010).
#' @return MAF table.
#' @examples
#' var.annovar <- system.file("extdata", "variants.hg19_multianno.txt", package = "maftools")
#' var.annovar.maf <- annovarToMaf(annovar = var.annovar, Center = 'CSI-NUS', refBuild = 'hg19', tsbCol = 'Tumor_Sample_Barcode', table = 'ensGene', header = T)
#' @export

annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL, table = 'refGene', basename = NULL, header = F, sep = '\t', MAFobj = F){
annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL, table = 'refGene', basename = NULL, header = FALSE, sep = '\t', MAFobj = FALSE){

if(header){
ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = F, skip = 1, header = T)
ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = FALSE, skip = 1, header = TRUE)
if(table == 'ensGene'){
colnames(ann)[1:10] = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.ensGene', 'Gene.ensGene', 'GeneDetail.ensGene', 'ExonicFunc.ensGene', 'AAChange.ensGene')
}else{
colnames(ann)[1:10] = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene')
}
}else{
ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = F, skip =1, header = F)
ann = fread(input = annovar, colClasses = 'character', sep = sep, stringsAsFactors = FALSE, skip =1, header = FALSE)
if(table == 'ensGene'){
colnames(ann)[1:10] = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.ensGene', 'Gene.ensGene', 'GeneDetail.ensGene', 'ExonicFunc.ensGene', 'AAChange.ensGene')
}else{
Expand All @@ -43,7 +46,7 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL

#Check to see if input file contains sample names
if(is.null(tsbCol)){
if(! 'Tumor_Sample_Barcode' %in% colnames(annovar)){
if(! 'Tumor_Sample_Barcode' %in% colnames(ann)){
message('Available fields:')
print(ann)
stop('Tumor_Sample_Barcode field not found in input file. Use argument tsbCol to manually specifiy field name containing sample names/Tumor_Sample_Barcodes.')
Expand Down Expand Up @@ -83,12 +86,20 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
#Rest of the optional fields (later they will be attached to the maf file)
ann.opt = colnames(ann)[!colnames(ann) %in% ann.mand]
ann.opt = c(ann.opt, 'uid')
ann.opt = ann[,ann.opt,with = F]
ann.opt = ann[,ann.opt,with = FALSE]

ann = ann[,ann.mand,with = F]
ann = ann[,ann.mand,with = FALSE]

ann$ExonicFunc.refGene = gsub(pattern = ' SNV', replacement = '', x = ann$ExonicFunc.refGene)

funcSpl = strsplit(x = as.character(ann$ExonicFunc.refGene), split = ';', fixed = TRUE)
funcSpl = sapply(funcSpl, function(l){l[length(l)]})
ann$ExonicFunc.refGene = funcSpl

funcRef = strsplit(x = as.character(ann$Func.refGene), split = ';', fixed = TRUE)
funcRef = sapply(funcRef, function(l){l[length(l)]})
ann$Func.refGene = funcRef

#Change Variant Classification factors.
ann$ExonicFunc.refGene = ifelse(test = ann$Func.refGene == 'intronic', yes = 'Intron', no = ann$ExonicFunc.refGene)
ann$ExonicFunc.refGene = ifelse(test = ann$Func.refGene == 'intergenic', yes = 'IGR', no = ann$ExonicFunc.refGene)
Expand Down Expand Up @@ -127,19 +138,19 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
ann$var.type = 'SNP'
}

ann = rbind(ann, ann.del, ann.ins, fill = T)
ann = rbind(ann, ann.del, ann.ins, fill = TRUE)

ann.splice = ann[ExonicFunc.refGene == 'Splice_Site']
if(nrow(ann.splice) > 0){
ann = ann[ExonicFunc.refGene != 'Splice_Site']
#ann.splice$AAChange.refGene = gsub(x = sapply(strsplit(x = as.character(ann.splice$Gene.refGene), split = '(', fixed = T), '[[',2), pattern = ')$', replacement = '')
ann.splice$Gene.refGene = sapply(strsplit(x = as.character(ann.splice$Gene.refGene), split = '(', fixed = T), '[[',1)
ann = rbind(ann, ann.splice, fill = T)
ann.splice$Gene.refGene = sapply(strsplit(x = as.character(ann.splice$Gene.refGene), split = '(', fixed = TRUE), '[[',1)
ann = rbind(ann, ann.splice, fill = TRUE)
}

#protein and tx changes
#NOTE: for now last transcript is considered from the total annotaion.
xaa = strsplit(as.character(ann$AAChange.refGene),split = ':',fixed = T)
xaa = strsplit(as.character(ann$AAChange.refGene),split = ':',fixed = TRUE)
proteinChange = sapply(xaa, function(l){l[length(l)]})
tx = unlist(sapply(xaa, function(l){l[length(l)-3]}))
txChange = unlist(sapply(xaa, function(l){if(length(l) > 1){l[length(l)-1]} else{NA}}))
Expand All @@ -159,21 +170,21 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
message('Converting Ensemble Gene IDs into HGNC gene symbols.')
if(Sys.info()[['sysname']] == 'Windows'){
ens.gz = gzfile(description = ens, open = 'r')
ens <- suppressWarnings( data.table(read.csv( file = gff.gz, header = T, sep = '\t', stringsAsFactors = F)) )
ens <- suppressWarnings( data.table(read.csv( file = gff.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) )
close(ens.gz)
} else{
ens = fread(input = paste('zcat <', ens), sep = '\t', stringsAsFactors = F)
ens = fread(input = paste('zcat <', ens), sep = '\t', stringsAsFactors = FALSE)
}

ann.maf = merge(ann.maf, ens, by.x = 'Hugo_Symbol', by.y = 'ens_id', all.x = T)
ann.maf = merge(ann.maf, ens, by.x = 'Hugo_Symbol', by.y = 'ens_id', all.x = TRUE)
ann.maf[,ens_id := Hugo_Symbol] #Backup original ids
ann.maf[,Hugo_Symbol := hgnc_symbol] #Add GHNC gene names
ann.maf[,Entrez_Gene_Id := Entrez] #Add entrez identifiers.
message('Done! Original ensemble gene IDs are preserved under field name ens_id')
}

if(!is.null(basename)){
write.table(x = ann.maf, file = paste(basename, 'maf', sep='.'), sep='\t', quote = F, row.names = F)
write.table(x = ann.maf, file = paste(basename, 'maf', sep='.'), sep='\t', quote = FALSE, row.names = FALSE)
}

if(MAFobj){
Expand All @@ -182,15 +193,20 @@ annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL
message('Too few samples to create MAF object. Returning MAF table.')
return(ann.maf)
}else{
ann.maf.onomat = createOncoMatrix(maf = ann.maf)
#Convert to factors.
ann.maf$Tumor_Sample_Barcode = as.factor(x = ann.maf$Hugo_Symbol)
ann.maf$Variant_Classification = as.factor(x = ann.maf$Variant_Classification)
ann.maf$Variant_Type = as.factor(x = ann.maf$Variant_Type)

ann.maf.oncomat = createOncoMatrix(maf = ann.maf)

silent = c("Silent", "Intron", "RNA", "3'UTR", "3'Flank", "5'UTR", "5'Flank", "IGR")
ann.maf.silent = ann.maf[Variant_Classification %in% silent]

m = MAF(data = ann.maf, variants.per.sample = ann.maf.summary$variants.per.sample, variant.type.summary = ann.maf.summary$variant.type.summary,
variant.classification.summary = ann.maf.summary$variant.classification.summary, gene.summary = ann.maf.summary$gene.summary,
oncoMatrix = ann.maf.onomat$oncomat, numericMatrix = ann.maf.onomat$nummat, summary = ann.maf.summary$summary,
classCode = ann.maf.onomat$vc, maf.silent = ann.maf.silent)
oncoMatrix = ann.maf.oncomat$oncomat, numericMatrix = ann.maf.oncomat$nummat, summary = ann.maf.summary$summary,
classCode = ann.maf.oncomat$vc, maf.silent = ann.maf.silent)

return(m)
}
Expand Down
Loading

0 comments on commit 3094327

Please sign in to comment.