I don't know how to commit

zwbao · Mar 6, 2016 · c048257 · c048257
1 parent 6204143
commit c048257
Show file tree

Hide file tree

Showing 32 changed files with 206,854 additions and 0 deletions.
diff --git a/.Rhistory b/.Rhistory
diff --git a/8-DEG/BCV.png b/8-DEG/BCV.png
diff --git a/8-DEG/DEG_gseaInput.R b/8-DEG/DEG_gseaInput.R
@@ -0,0 +1,168 @@
+#suppressMessages(library(affy))
+#suppressMessages(library(limma))
+library(DESeq)
+library(limma)
+library(DESeq2)
+library(edgeR)
+
+suppressMessages(library(optparse))
+txt_output=T ## whether need txt or not ?
+
+option_list = list(
+	make_option(c("-g", "--gct"),help="The full path of gct format file "),
+	make_option(c("-c", "--cls"),help="The full path of cls format file "),
+	make_option(c("-m", "--method"),default="limma",help="The full path of cls format file "),
+)
+opt = parse_args(OptionParser(option_list=option_list))
+if(is.null(opt$gct) || is.null(opt$cls)){
+   stop(sprintf("The gct and cls files is required !!!"))
+}
+cls_file=opt$gct
+gct_file=opt$cls
+DEG_method=tolower(opt$method)
+TEST=T
+if (TEST){
+	cls_file="P53.cls"
+	gct_file="P53_hgu95av2.gct"
+	DEG_method='limma'
+}
+studyID=sub(".gct","",gct_file)
+rmDupID <-function(a=matrix(c(1,1:5,2,2:6,2,3:7),ncol=6)){
+  exprSet=a[,-1]
+  rowMeans=apply(exprSet,1,mean)
+  a=a[order(rowMeans,decreasing=T),]
+  return(a[!duplicated(a[,1]),])
+}
+
+cat(paste("step1:",Sys.time(),"start to do DEG !!! \n"))
+a=read.table(gct_file,stringsAsFactors=F,skip=2,header=T,sep="\t")
+## probeID/geneID/value1/value2~~~~~~
+exprSet=a[,-1]
+exprSet=rmDupID(exprSet)
+rownames(exprSet)=exprSet[,1]
+exprSet=exprSet[,-1]
+sample_list=colnames(exprSet)
+b=read.table(cls_file,stringsAsFactors=F,skip=2)
+group_list=as.character(b)
+sample_group=data.frame(sample_list,group_list)
+names(sample_group)=c('sample_id','compare')
+
+##liver; males and femlaes human pubmed ID: 	20009012
+phenodata=read.table("http://bowtie-bio.sourceforge.net/recount/phenotypeTables/gilad_phenodata.txt",header=T)
+countdata=read.table("http://bowtie-bio.sourceforge.net/recount/countTables/gilad_count_table.txt",header = T,sep = "\t")
+rmDupID <-function(a=matrix(c(1,1:5,2,2:6,2,3:7),ncol=6)){
+  exprSet=a[,-1]
+  rowMeans=apply(exprSet,1,mean)
+  a=a[order(rowMeans,decreasing=T),]
+  return(a[!duplicated(a[,1]),])
+}
+exprSet=rmDupID(countdata)
+rownames(exprSet)=exprSet[,1]
+exprSet=exprSet[,-1]
+sample_list=colnames(exprSet)
+group_list=as.character(phenodata[,3])
+
+DEG_results=DEG_voom(exprSet,group_list)
+DEG_results=DEG_DESeq2(exprSet,group_list)
+
+
+if(DEG_method=='limma'){
+	DEG_results=DEG_limma(exprSet,group_list)		
+}else if (DEG_method=='voom'){
+	DEG_results=DEG_voom(exprSet,group_list)
+}else if (DEG_method=='deseq'){
+	DEG_results=DEG_DESeq(exprSet,group_list)
+}else if (DEG_method=='deseq2'){
+	DEG_results=DEG_DESeq2(exprSet,group_list)
+}else if (DEG_method=='edger'){
+	DEG_results=DEG_edger(exprSet,group_list)
+}else{
+	stop(sprintf("we don't offer the method you want"))
+}
+
+
+DEG_edger <- function(exprSet=exprSet,group_list=group_list){
+	suppressMessages(library(edgeR)	
+	d <- DGEList(counts=exprSet,group=factor(group_list))
+	d.full <- d # keep the old one in case we mess up
+	apply(d$counts, 2, sum) # total gene counts per samplekeep <- rowSums(cpm(d)>100) >= 2
+	d <- d[keep,]
+	d$samples$lib.size <- colSums(d$counts)
+	d <- calcNormFactors(d)
+	png("MDS.png")
+	plotMDS(d, method="bcv", col=as.numeric(d$samples$group))
+	legend("bottomleft", as.character(unique(d$samples$group)), col=1:3, pch=20)
+	dev.off()
+	d1 <- estimateCommonDisp(d, verbose=T)
+	d1 <- estimateTagwiseDisp(d1)
+	et12 <- exactTest(d1, pair=c(1,2)) 
+	png("MA.png")
+	de1 <- decideTestsDGE(et12, adjust.method="BH", p.value=0.05)
+	de1tags12 <- rownames(d1)[as.logical(de1)] 
+	plotSmear(et12, de.tags=de1tags12)
+	abline(h = c(-2, 2), col = "blue")
+	dev.off()
+	topTags(et12, n=10)
+}
+
+DEG_limma <- function(exprSet=exprSet,group_list=group_list){
+	suppressMessages(library(limma))
+	design <- model.matrix(~0+factor(group_list))
+	colnames(design)=levels(factor(group_list))
+	rownames(design)=colnames(exprSet)
+	fit <- lmFit(exprSet,design)
+	png("MA.png")
+	plotMA(fit)
+	abline(0,0,col="blue")
+	dev.off()
+	fit2 <- eBayes(fit)
+	tempOutput = topTable(fit2, coef=1, n=Inf)
+	nrDEG2 = na.omit(tempOutput) 
+}
+
+DEG_voom <- function(exprSet=exprSet,group_list=group_list){
+	suppressMessages(library(limma))
+	design <- model.matrix(~0+factor(group_list))
+	colnames(design)=levels(factor(group_list))
+	rownames(design)=colnames(exprSet)
+	v <- voom(exprSet,design,normalize="quantile")
+	png("MDS.png")
+	plotMDS(v, labels=1:ncol(exprSet),col=rainbow(ncol(exprSet)))
+	#abline(0,0,col="blue")
+	dev.off()
+	fit <- lmFit(v,design)
+	png("MA.png")
+	plotMA(fit)
+	#abline(0,0,col="blue")
+	dev.off()
+	fit2 <- eBayes(fit)
+	tempOutput = topTable(fit2, coef=1, n=Inf)
+	nrDEG2 = na.omit(tempOutput)	
+}
+
+DEG_DESeq2 <- function(exprSet=exprSet,group_list=group_list){
+	suppressMessages(library(DESeq2))
+	exprSet=ceiling(exprSet)
+	(colData <- data.frame(row.names=colnames(exprSet), group_list=group_list))
+	dds <- DESeqDataSetFromMatrix(countData = exprSet,
+								colData = colData,
+								design = ~ group_list)
+	dds <- DESeq(dds)
+	res <- results(dds)
+	png("MA.png")
+	#plotMA(res, main="DESeq2", ylim=c(-2,2))
+	dev.off()
+	resOrdered <- res[order(res$padj),]
+	resOrdered=as.data.frame(resOrdered)
+}
+
+DEG_DESeq <- function(exprSet=exprSet,group_list=group_list){
+	suppressMessages(library(DESeq))
+	exprSet=ceiling(exprSet)
+	de=newCountDataSet(exprSet,group_list)
+	de=estimateSizeFactors(de)
+	de=estimateDispersions(de)
+	res=nbinomTest(de,"case","control")
+	rownames(res)=res[,1]
+	res=res[,-1]
+}
diff --git a/8-DEG/DESeq2/DESeq2.pdf b/8-DEG/DESeq2/DESeq2.pdf
diff --git a/8-DEG/DESeq2/github-R-code.txt b/8-DEG/DESeq2/github-R-code.txt
@@ -0,0 +1,151 @@
+## RNA-seq analysis with DESeq2
+## Stephen Turner, @genetics_blog
+
+# RNA-seq data from GSE52202
+# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with
+# ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl")
+
+# Import & pre-process ----------------------------------------------------
+
+# Import data from featureCounts
+## Previously ran at command line something like this:
+## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam
+countdata <- read.table("counts.txt", header=TRUE, row.names=1)
+
+# Remove first five columns (chr, start, end, strand, length)
+countdata <- countdata[ ,6:ncol(countdata)]
+
+# Remove .bam or .sam from filenames
+colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata))
+
+# Convert to matrix
+countdata <- as.matrix(countdata)
+head(countdata)
+
+# Assign condition (first four are controls, second four contain the expansion)
+(condition <- factor(c(rep("ctl", 4), rep("exp", 4))))
+
+# Analysis with DESeq2 ----------------------------------------------------
+
+library(DESeq2)
+
+# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
+(coldata <- data.frame(row.names=colnames(countdata), condition))
+dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
+dds
+
+# Run the DESeq pipeline
+dds <- DESeq(dds)
+
+# Plot dispersions
+png("qc-dispersions.png", 1000, 1000, pointsize=20)
+plotDispEsts(dds, main="Dispersion plot")
+dev.off()
+
+# Regularized log transformation for clustering/heatmaps, etc
+rld <- rlogTransformation(dds)
+head(assay(rld))
+hist(assay(rld))
+
+# Colors for plots below
+## Ugly:
+## (mycols <- 1:length(unique(condition)))
+## Use RColorBrewer, better
+library(RColorBrewer)
+(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])
+
+# Sample distance heatmap
+sampleDists <- as.matrix(dist(t(assay(rld))))
+library(gplots)
+png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
+heatmap.2(as.matrix(sampleDists), key=F, trace="none",
+          col=colorpanel(100, "black", "white"),
+          ColSideColors=mycols[condition], RowSideColors=mycols[condition],
+          margin=c(10, 10), main="Sample Distance Matrix")
+dev.off()
+
+# Principal components analysis
+## Could do with built-in DESeq2 function:
+## DESeq2::plotPCA(rld, intgroup="condition")
+## I like mine better:
+rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
+  require(genefilter)
+  require(calibrate)
+  require(RColorBrewer)
+  rv = rowVars(assay(rld))
+  select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
+  pca = prcomp(t(assay(rld)[select, ]))
+  fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
+  if (is.null(colors)) {
+    if (nlevels(fac) >= 3) {
+      colors = brewer.pal(nlevels(fac), "Paired")
+    }   else {
+      colors = c("black", "red")
+    }
+  }
+  pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
+  pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
+  pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
+  pc2lab <- paste0("PC1 (",as.character(pc2var),"%)")
+  plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
+  with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
+  legend(legendpos, legend=levels(fac), col=colors, pch=20)
+  #     rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
+  #            pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
+  #                                                                                         terldt = list(levels(fac)), rep = FALSE)))
+}
+png("qc-pca.png", 1000, 1000, pointsize=20)
+rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35))
+dev.off()
+
+
+# Get differential expression results
+res <- results(dds)
+table(res$padj<0.05)
+## Order by adjusted p-value
+res <- res[order(res$padj), ]
+## Merge with normalized count data
+resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
+names(resdata)[1] <- "Gene"
+head(resdata)
+## Write results
+write.csv(resdata, file="diffexpr-results.csv")
+
+## Examine plot of p-values
+hist(res$pvalue, breaks=50, col="grey")
+
+## Examine independent filtering
+attr(res, "filterThreshold")
+plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections")
+
+## MA plot
+## Could do with built-in DESeq2 function:
+## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1)
+## I like mine better:
+maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) {
+  with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
+  with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
+  if (labelsig) {
+    require(calibrate)
+    with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
+  }
+}
+png("diffexpr-maplot.png", 1500, 1000, pointsize=20)
+maplot(resdata, main="MA Plot")
+dev.off()
+
+## Volcano plot with "significant" genes labeled
+volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
+  with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
+  with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
+  with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
+  with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
+  if (labelsig) {
+    require(calibrate)
+    with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
+  }
+  legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
+}
+png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
+volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2))
+dev.off()