forked from jmzeng1314/my-R
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6204143
commit c048257
Showing
32 changed files
with
206,854 additions
and
0 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
#suppressMessages(library(affy)) | ||
#suppressMessages(library(limma)) | ||
library(DESeq) | ||
library(limma) | ||
library(DESeq2) | ||
library(edgeR) | ||
|
||
suppressMessages(library(optparse)) | ||
txt_output=T ## whether need txt or not ? | ||
|
||
option_list = list( | ||
make_option(c("-g", "--gct"),help="The full path of gct format file "), | ||
make_option(c("-c", "--cls"),help="The full path of cls format file "), | ||
make_option(c("-m", "--method"),default="limma",help="The full path of cls format file "), | ||
) | ||
opt = parse_args(OptionParser(option_list=option_list)) | ||
if(is.null(opt$gct) || is.null(opt$cls)){ | ||
stop(sprintf("The gct and cls files is required !!!")) | ||
} | ||
cls_file=opt$gct | ||
gct_file=opt$cls | ||
DEG_method=tolower(opt$method) | ||
TEST=T | ||
if (TEST){ | ||
cls_file="P53.cls" | ||
gct_file="P53_hgu95av2.gct" | ||
DEG_method='limma' | ||
} | ||
studyID=sub(".gct","",gct_file) | ||
rmDupID <-function(a=matrix(c(1,1:5,2,2:6,2,3:7),ncol=6)){ | ||
exprSet=a[,-1] | ||
rowMeans=apply(exprSet,1,mean) | ||
a=a[order(rowMeans,decreasing=T),] | ||
return(a[!duplicated(a[,1]),]) | ||
} | ||
|
||
cat(paste("step1:",Sys.time(),"start to do DEG !!! \n")) | ||
a=read.table(gct_file,stringsAsFactors=F,skip=2,header=T,sep="\t") | ||
## probeID/geneID/value1/value2~~~~~~ | ||
exprSet=a[,-1] | ||
exprSet=rmDupID(exprSet) | ||
rownames(exprSet)=exprSet[,1] | ||
exprSet=exprSet[,-1] | ||
sample_list=colnames(exprSet) | ||
b=read.table(cls_file,stringsAsFactors=F,skip=2) | ||
group_list=as.character(b) | ||
sample_group=data.frame(sample_list,group_list) | ||
names(sample_group)=c('sample_id','compare') | ||
|
||
##liver; males and femlaes human pubmed ID: 20009012 | ||
phenodata=read.table("http://bowtie-bio.sourceforge.net/recount/phenotypeTables/gilad_phenodata.txt",header=T) | ||
countdata=read.table("http://bowtie-bio.sourceforge.net/recount/countTables/gilad_count_table.txt",header = T,sep = "\t") | ||
rmDupID <-function(a=matrix(c(1,1:5,2,2:6,2,3:7),ncol=6)){ | ||
exprSet=a[,-1] | ||
rowMeans=apply(exprSet,1,mean) | ||
a=a[order(rowMeans,decreasing=T),] | ||
return(a[!duplicated(a[,1]),]) | ||
} | ||
exprSet=rmDupID(countdata) | ||
rownames(exprSet)=exprSet[,1] | ||
exprSet=exprSet[,-1] | ||
sample_list=colnames(exprSet) | ||
group_list=as.character(phenodata[,3]) | ||
|
||
DEG_results=DEG_voom(exprSet,group_list) | ||
DEG_results=DEG_DESeq2(exprSet,group_list) | ||
|
||
|
||
if(DEG_method=='limma'){ | ||
DEG_results=DEG_limma(exprSet,group_list) | ||
}else if (DEG_method=='voom'){ | ||
DEG_results=DEG_voom(exprSet,group_list) | ||
}else if (DEG_method=='deseq'){ | ||
DEG_results=DEG_DESeq(exprSet,group_list) | ||
}else if (DEG_method=='deseq2'){ | ||
DEG_results=DEG_DESeq2(exprSet,group_list) | ||
}else if (DEG_method=='edger'){ | ||
DEG_results=DEG_edger(exprSet,group_list) | ||
}else{ | ||
stop(sprintf("we don't offer the method you want")) | ||
} | ||
|
||
|
||
DEG_edger <- function(exprSet=exprSet,group_list=group_list){ | ||
suppressMessages(library(edgeR) | ||
d <- DGEList(counts=exprSet,group=factor(group_list)) | ||
d.full <- d # keep the old one in case we mess up | ||
apply(d$counts, 2, sum) # total gene counts per samplekeep <- rowSums(cpm(d)>100) >= 2 | ||
d <- d[keep,] | ||
d$samples$lib.size <- colSums(d$counts) | ||
d <- calcNormFactors(d) | ||
png("MDS.png") | ||
plotMDS(d, method="bcv", col=as.numeric(d$samples$group)) | ||
legend("bottomleft", as.character(unique(d$samples$group)), col=1:3, pch=20) | ||
dev.off() | ||
d1 <- estimateCommonDisp(d, verbose=T) | ||
d1 <- estimateTagwiseDisp(d1) | ||
et12 <- exactTest(d1, pair=c(1,2)) | ||
png("MA.png") | ||
de1 <- decideTestsDGE(et12, adjust.method="BH", p.value=0.05) | ||
de1tags12 <- rownames(d1)[as.logical(de1)] | ||
plotSmear(et12, de.tags=de1tags12) | ||
abline(h = c(-2, 2), col = "blue") | ||
dev.off() | ||
topTags(et12, n=10) | ||
} | ||
|
||
DEG_limma <- function(exprSet=exprSet,group_list=group_list){ | ||
suppressMessages(library(limma)) | ||
design <- model.matrix(~0+factor(group_list)) | ||
colnames(design)=levels(factor(group_list)) | ||
rownames(design)=colnames(exprSet) | ||
fit <- lmFit(exprSet,design) | ||
png("MA.png") | ||
plotMA(fit) | ||
abline(0,0,col="blue") | ||
dev.off() | ||
fit2 <- eBayes(fit) | ||
tempOutput = topTable(fit2, coef=1, n=Inf) | ||
nrDEG2 = na.omit(tempOutput) | ||
} | ||
|
||
DEG_voom <- function(exprSet=exprSet,group_list=group_list){ | ||
suppressMessages(library(limma)) | ||
design <- model.matrix(~0+factor(group_list)) | ||
colnames(design)=levels(factor(group_list)) | ||
rownames(design)=colnames(exprSet) | ||
v <- voom(exprSet,design,normalize="quantile") | ||
png("MDS.png") | ||
plotMDS(v, labels=1:ncol(exprSet),col=rainbow(ncol(exprSet))) | ||
#abline(0,0,col="blue") | ||
dev.off() | ||
fit <- lmFit(v,design) | ||
png("MA.png") | ||
plotMA(fit) | ||
#abline(0,0,col="blue") | ||
dev.off() | ||
fit2 <- eBayes(fit) | ||
tempOutput = topTable(fit2, coef=1, n=Inf) | ||
nrDEG2 = na.omit(tempOutput) | ||
} | ||
|
||
DEG_DESeq2 <- function(exprSet=exprSet,group_list=group_list){ | ||
suppressMessages(library(DESeq2)) | ||
exprSet=ceiling(exprSet) | ||
(colData <- data.frame(row.names=colnames(exprSet), group_list=group_list)) | ||
dds <- DESeqDataSetFromMatrix(countData = exprSet, | ||
colData = colData, | ||
design = ~ group_list) | ||
dds <- DESeq(dds) | ||
res <- results(dds) | ||
png("MA.png") | ||
#plotMA(res, main="DESeq2", ylim=c(-2,2)) | ||
dev.off() | ||
resOrdered <- res[order(res$padj),] | ||
resOrdered=as.data.frame(resOrdered) | ||
} | ||
|
||
DEG_DESeq <- function(exprSet=exprSet,group_list=group_list){ | ||
suppressMessages(library(DESeq)) | ||
exprSet=ceiling(exprSet) | ||
de=newCountDataSet(exprSet,group_list) | ||
de=estimateSizeFactors(de) | ||
de=estimateDispersions(de) | ||
res=nbinomTest(de,"case","control") | ||
rownames(res)=res[,1] | ||
res=res[,-1] | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
## RNA-seq analysis with DESeq2 | ||
## Stephen Turner, @genetics_blog | ||
|
||
# RNA-seq data from GSE52202 | ||
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with | ||
# ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl") | ||
|
||
# Import & pre-process ---------------------------------------------------- | ||
|
||
# Import data from featureCounts | ||
## Previously ran at command line something like this: | ||
## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam | ||
countdata <- read.table("counts.txt", header=TRUE, row.names=1) | ||
|
||
# Remove first five columns (chr, start, end, strand, length) | ||
countdata <- countdata[ ,6:ncol(countdata)] | ||
|
||
# Remove .bam or .sam from filenames | ||
colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata)) | ||
|
||
# Convert to matrix | ||
countdata <- as.matrix(countdata) | ||
head(countdata) | ||
|
||
# Assign condition (first four are controls, second four contain the expansion) | ||
(condition <- factor(c(rep("ctl", 4), rep("exp", 4)))) | ||
|
||
# Analysis with DESeq2 ---------------------------------------------------- | ||
|
||
library(DESeq2) | ||
|
||
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix | ||
(coldata <- data.frame(row.names=colnames(countdata), condition)) | ||
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) | ||
dds | ||
|
||
# Run the DESeq pipeline | ||
dds <- DESeq(dds) | ||
|
||
# Plot dispersions | ||
png("qc-dispersions.png", 1000, 1000, pointsize=20) | ||
plotDispEsts(dds, main="Dispersion plot") | ||
dev.off() | ||
|
||
# Regularized log transformation for clustering/heatmaps, etc | ||
rld <- rlogTransformation(dds) | ||
head(assay(rld)) | ||
hist(assay(rld)) | ||
|
||
# Colors for plots below | ||
## Ugly: | ||
## (mycols <- 1:length(unique(condition))) | ||
## Use RColorBrewer, better | ||
library(RColorBrewer) | ||
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))]) | ||
|
||
# Sample distance heatmap | ||
sampleDists <- as.matrix(dist(t(assay(rld)))) | ||
library(gplots) | ||
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20) | ||
heatmap.2(as.matrix(sampleDists), key=F, trace="none", | ||
col=colorpanel(100, "black", "white"), | ||
ColSideColors=mycols[condition], RowSideColors=mycols[condition], | ||
margin=c(10, 10), main="Sample Distance Matrix") | ||
dev.off() | ||
|
||
# Principal components analysis | ||
## Could do with built-in DESeq2 function: | ||
## DESeq2::plotPCA(rld, intgroup="condition") | ||
## I like mine better: | ||
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) { | ||
require(genefilter) | ||
require(calibrate) | ||
require(RColorBrewer) | ||
rv = rowVars(assay(rld)) | ||
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] | ||
pca = prcomp(t(assay(rld)[select, ])) | ||
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : ")) | ||
if (is.null(colors)) { | ||
if (nlevels(fac) >= 3) { | ||
colors = brewer.pal(nlevels(fac), "Paired") | ||
} else { | ||
colors = c("black", "red") | ||
} | ||
} | ||
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1) | ||
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1) | ||
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)") | ||
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)") | ||
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...) | ||
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx)) | ||
legend(legendpos, legend=levels(fac), col=colors, pch=20) | ||
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld), | ||
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours), | ||
# terldt = list(levels(fac)), rep = FALSE))) | ||
} | ||
png("qc-pca.png", 1000, 1000, pointsize=20) | ||
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35)) | ||
dev.off() | ||
|
||
|
||
# Get differential expression results | ||
res <- results(dds) | ||
table(res$padj<0.05) | ||
## Order by adjusted p-value | ||
res <- res[order(res$padj), ] | ||
## Merge with normalized count data | ||
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) | ||
names(resdata)[1] <- "Gene" | ||
head(resdata) | ||
## Write results | ||
write.csv(resdata, file="diffexpr-results.csv") | ||
|
||
## Examine plot of p-values | ||
hist(res$pvalue, breaks=50, col="grey") | ||
|
||
## Examine independent filtering | ||
attr(res, "filterThreshold") | ||
plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections") | ||
|
||
## MA plot | ||
## Could do with built-in DESeq2 function: | ||
## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1) | ||
## I like mine better: | ||
maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) { | ||
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...)) | ||
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5)) | ||
if (labelsig) { | ||
require(calibrate) | ||
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2)) | ||
} | ||
} | ||
png("diffexpr-maplot.png", 1500, 1000, pointsize=20) | ||
maplot(resdata, main="MA Plot") | ||
dev.off() | ||
|
||
## Volcano plot with "significant" genes labeled | ||
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) { | ||
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...)) | ||
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...)) | ||
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...)) | ||
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...)) | ||
if (labelsig) { | ||
require(calibrate) | ||
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...)) | ||
} | ||
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green")) | ||
} | ||
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20) | ||
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2)) | ||
dev.off() |
Oops, something went wrong.