Skip to content

Commit

Permalink
I don't know how to commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jmzeng1314 committed Mar 6, 2016
1 parent 6204143 commit c048257
Show file tree
Hide file tree
Showing 32 changed files with 206,854 additions and 0 deletions.
512 changes: 512 additions & 0 deletions .Rhistory

Large diffs are not rendered by default.

Binary file added 8-DEG/BCV.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
168 changes: 168 additions & 0 deletions 8-DEG/DEG_gseaInput.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#suppressMessages(library(affy))
#suppressMessages(library(limma))
library(DESeq)
library(limma)
library(DESeq2)
library(edgeR)

suppressMessages(library(optparse))
txt_output=T ## whether need txt or not ?

option_list = list(
make_option(c("-g", "--gct"),help="The full path of gct format file "),
make_option(c("-c", "--cls"),help="The full path of cls format file "),
make_option(c("-m", "--method"),default="limma",help="The full path of cls format file "),
)
opt = parse_args(OptionParser(option_list=option_list))
if(is.null(opt$gct) || is.null(opt$cls)){
stop(sprintf("The gct and cls files is required !!!"))
}
cls_file=opt$gct
gct_file=opt$cls
DEG_method=tolower(opt$method)
TEST=T
if (TEST){
cls_file="P53.cls"
gct_file="P53_hgu95av2.gct"
DEG_method='limma'
}
studyID=sub(".gct","",gct_file)
rmDupID <-function(a=matrix(c(1,1:5,2,2:6,2,3:7),ncol=6)){
exprSet=a[,-1]
rowMeans=apply(exprSet,1,mean)
a=a[order(rowMeans,decreasing=T),]
return(a[!duplicated(a[,1]),])
}

cat(paste("step1:",Sys.time(),"start to do DEG !!! \n"))
a=read.table(gct_file,stringsAsFactors=F,skip=2,header=T,sep="\t")
## probeID/geneID/value1/value2~~~~~~
exprSet=a[,-1]
exprSet=rmDupID(exprSet)
rownames(exprSet)=exprSet[,1]
exprSet=exprSet[,-1]
sample_list=colnames(exprSet)
b=read.table(cls_file,stringsAsFactors=F,skip=2)
group_list=as.character(b)
sample_group=data.frame(sample_list,group_list)
names(sample_group)=c('sample_id','compare')

##liver; males and femlaes human pubmed ID: 20009012
phenodata=read.table("http://bowtie-bio.sourceforge.net/recount/phenotypeTables/gilad_phenodata.txt",header=T)
countdata=read.table("http://bowtie-bio.sourceforge.net/recount/countTables/gilad_count_table.txt",header = T,sep = "\t")
rmDupID <-function(a=matrix(c(1,1:5,2,2:6,2,3:7),ncol=6)){
exprSet=a[,-1]
rowMeans=apply(exprSet,1,mean)
a=a[order(rowMeans,decreasing=T),]
return(a[!duplicated(a[,1]),])
}
exprSet=rmDupID(countdata)
rownames(exprSet)=exprSet[,1]
exprSet=exprSet[,-1]
sample_list=colnames(exprSet)
group_list=as.character(phenodata[,3])

DEG_results=DEG_voom(exprSet,group_list)
DEG_results=DEG_DESeq2(exprSet,group_list)


if(DEG_method=='limma'){
DEG_results=DEG_limma(exprSet,group_list)
}else if (DEG_method=='voom'){
DEG_results=DEG_voom(exprSet,group_list)
}else if (DEG_method=='deseq'){
DEG_results=DEG_DESeq(exprSet,group_list)
}else if (DEG_method=='deseq2'){
DEG_results=DEG_DESeq2(exprSet,group_list)
}else if (DEG_method=='edger'){
DEG_results=DEG_edger(exprSet,group_list)
}else{
stop(sprintf("we don't offer the method you want"))
}


DEG_edger <- function(exprSet=exprSet,group_list=group_list){
suppressMessages(library(edgeR)
d <- DGEList(counts=exprSet,group=factor(group_list))
d.full <- d # keep the old one in case we mess up
apply(d$counts, 2, sum) # total gene counts per samplekeep <- rowSums(cpm(d)>100) >= 2
d <- d[keep,]
d$samples$lib.size <- colSums(d$counts)
d <- calcNormFactors(d)
png("MDS.png")
plotMDS(d, method="bcv", col=as.numeric(d$samples$group))
legend("bottomleft", as.character(unique(d$samples$group)), col=1:3, pch=20)
dev.off()
d1 <- estimateCommonDisp(d, verbose=T)
d1 <- estimateTagwiseDisp(d1)
et12 <- exactTest(d1, pair=c(1,2))
png("MA.png")
de1 <- decideTestsDGE(et12, adjust.method="BH", p.value=0.05)
de1tags12 <- rownames(d1)[as.logical(de1)]
plotSmear(et12, de.tags=de1tags12)
abline(h = c(-2, 2), col = "blue")
dev.off()
topTags(et12, n=10)
}

DEG_limma <- function(exprSet=exprSet,group_list=group_list){
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
fit <- lmFit(exprSet,design)
png("MA.png")
plotMA(fit)
abline(0,0,col="blue")
dev.off()
fit2 <- eBayes(fit)
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG2 = na.omit(tempOutput)
}

DEG_voom <- function(exprSet=exprSet,group_list=group_list){
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
v <- voom(exprSet,design,normalize="quantile")
png("MDS.png")
plotMDS(v, labels=1:ncol(exprSet),col=rainbow(ncol(exprSet)))
#abline(0,0,col="blue")
dev.off()
fit <- lmFit(v,design)
png("MA.png")
plotMA(fit)
#abline(0,0,col="blue")
dev.off()
fit2 <- eBayes(fit)
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG2 = na.omit(tempOutput)
}

DEG_DESeq2 <- function(exprSet=exprSet,group_list=group_list){
suppressMessages(library(DESeq2))
exprSet=ceiling(exprSet)
(colData <- data.frame(row.names=colnames(exprSet), group_list=group_list))
dds <- DESeqDataSetFromMatrix(countData = exprSet,
colData = colData,
design = ~ group_list)
dds <- DESeq(dds)
res <- results(dds)
png("MA.png")
#plotMA(res, main="DESeq2", ylim=c(-2,2))
dev.off()
resOrdered <- res[order(res$padj),]
resOrdered=as.data.frame(resOrdered)
}

DEG_DESeq <- function(exprSet=exprSet,group_list=group_list){
suppressMessages(library(DESeq))
exprSet=ceiling(exprSet)
de=newCountDataSet(exprSet,group_list)
de=estimateSizeFactors(de)
de=estimateDispersions(de)
res=nbinomTest(de,"case","control")
rownames(res)=res[,1]
res=res[,-1]
}
Binary file added 8-DEG/DESeq2/DESeq2.pdf
Binary file not shown.
151 changes: 151 additions & 0 deletions 8-DEG/DESeq2/github-R-code.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
## RNA-seq analysis with DESeq2
## Stephen Turner, @genetics_blog

# RNA-seq data from GSE52202
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with
# ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl")

# Import & pre-process ----------------------------------------------------

# Import data from featureCounts
## Previously ran at command line something like this:
## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam
countdata <- read.table("counts.txt", header=TRUE, row.names=1)

# Remove first five columns (chr, start, end, strand, length)
countdata <- countdata[ ,6:ncol(countdata)]

# Remove .bam or .sam from filenames
colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata))

# Convert to matrix
countdata <- as.matrix(countdata)
head(countdata)

# Assign condition (first four are controls, second four contain the expansion)
(condition <- factor(c(rep("ctl", 4), rep("exp", 4))))

# Analysis with DESeq2 ----------------------------------------------------

library(DESeq2)

# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
(coldata <- data.frame(row.names=colnames(countdata), condition))
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
dds

# Run the DESeq pipeline
dds <- DESeq(dds)

# Plot dispersions
png("qc-dispersions.png", 1000, 1000, pointsize=20)
plotDispEsts(dds, main="Dispersion plot")
dev.off()

# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))

# Colors for plots below
## Ugly:
## (mycols <- 1:length(unique(condition)))
## Use RColorBrewer, better
library(RColorBrewer)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])

# Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld))))
library(gplots)
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[condition], RowSideColors=mycols[condition],
margin=c(10, 10), main="Sample Distance Matrix")
dev.off()

# Principal components analysis
## Could do with built-in DESeq2 function:
## DESeq2::plotPCA(rld, intgroup="condition")
## I like mine better:
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
require(genefilter)
require(calibrate)
require(RColorBrewer)
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select, ]))
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
if (is.null(colors)) {
if (nlevels(fac) >= 3) {
colors = brewer.pal(nlevels(fac), "Paired")
} else {
colors = c("black", "red")
}
}
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)")
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
legend(legendpos, legend=levels(fac), col=colors, pch=20)
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
# terldt = list(levels(fac)), rep = FALSE)))
}
png("qc-pca.png", 1000, 1000, pointsize=20)
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35))
dev.off()


# Get differential expression results
res <- results(dds)
table(res$padj<0.05)
## Order by adjusted p-value
res <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
## Write results
write.csv(resdata, file="diffexpr-results.csv")

## Examine plot of p-values
hist(res$pvalue, breaks=50, col="grey")

## Examine independent filtering
attr(res, "filterThreshold")
plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections")

## MA plot
## Could do with built-in DESeq2 function:
## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1)
## I like mine better:
maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) {
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
if (labelsig) {
require(calibrate)
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
}
}
png("diffexpr-maplot.png", 1500, 1000, pointsize=20)
maplot(resdata, main="MA Plot")
dev.off()

## Volcano plot with "significant" genes labeled
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
}
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2))
dev.off()
Loading

0 comments on commit c048257

Please sign in to comment.