Skip to content

Commit e1dd297

Browse files
committed
add 9/10
1 parent 92961b1 commit e1dd297

File tree

3 files changed

+227
-1
lines changed

3 files changed

+227
-1
lines changed

.DS_Store

8 KB
Binary file not shown.
+215
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
# ---- Load HTSeq counts, build the gene x sample matrix, filter ----
# Reads long-format counts, reshapes them into one row per gene and one
# column per sample, drops genes without signal in at least two samples,
# and writes both the raw and the filtered matrix to CSV.
# (The original `rm(list=ls())` was dropped: it wipes the caller's workspace
# when the script is sourced and nothing below depends on it.)
library(reshape2)
library(edgeR)
library(DESeq2)

# NOTE(review): machine-specific working directory; every relative path
# read or written by this script is resolved against it.
setwd("G:/mRNA/DEG")

# Input layout: one row per (sample, gene) pair, for example
######################################################################
#ESCTSA01.geneCounts Nek1 2790
#ESCTSA01.geneCounts Nek10 18
#ESCTSA01.geneCounts Nek11 2
#ESCTSA01.geneCounts Nek2 4945
######################################################################
a <- read.table("hisat2_mm10_htseq.txt", stringsAsFactors = FALSE)
colnames(a) <- c("sample", "gene", "reads")

# Name the value column explicitly instead of letting dcast() guess it, and
# fill gene/sample pairs that are absent from the input with a zero count so
# that no NA reaches cpm() or the count-based models further down.
exprSet <- dcast(a, gene ~ sample, value.var = "reads", fill = 0L)
head(exprSet)

# Optional: split off the HTSeq summary rows (names starting with "__").
# write.table(exprSet[grep("^__",exprSet$gene),],'hisat2.stats.txt',quote=F,sep='\t')
# exprSet=exprSet[!grepl("^__",exprSet$gene),]

# Move the gene identifiers from the first column into the row names.
geneLists <- exprSet[, 1]
exprSet <- exprSet[, -1, drop = FALSE]
head(exprSet)

rownames(exprSet) <- geneLists
# Keep only the part of each sample name before the first dot
# ("ESCTSA01.geneCounts" -> "ESCTSA01").
colnames(exprSet) <- sub("\\..*$", "", colnames(exprSet))

write.csv(exprSet, "raw_reads_matrix.csv")

# Keep genes with a non-zero CPM in at least two samples.
keepGene <- rowSums(cpm(exprSet) > 0) >= 2
table(keepGene)
dim(exprSet)
# Row names travel with the rows, so they need no re-assignment afterwards.
exprSet <- exprSet[keepGene, , drop = FALSE]
dim(exprSet)

str(exprSet)

# Group label of each sample; labels are paired with the columns of exprSet
# by position, so the lengths must agree.
group_list <- c("control", "control", "treat_12", "treat_12", "treat_2", "treat_2")
stopifnot(length(group_list) == ncol(exprSet))

write.csv(exprSet, "filter_reads_matrix.csv")
######################################################################
# ---- DESeq2 ----
# Fits the DESeq2 model on the filtered counts, writes QC plots
# (dispersion, raw vs. rlog distributions, sample-distance heatmap) and one
# result table per treatment-vs-control comparison.
# Expects `exprSet` (genes x samples counts) and `group_list` (one label per
# column of exprSet) to exist already.
######################################################################
if (TRUE) {

  suppressMessages(library(DESeq2))

  # Explicit factor: levels sort alphabetically, so "control" is the
  # reference level of the design.
  colData <- data.frame(
    row.names = colnames(exprSet),
    group_list = factor(group_list)
  )
  print(colData)

  dds <- DESeqDataSetFromMatrix(
    countData = exprSet,
    colData = colData,
    design = ~ group_list
  )
  dds <- DESeq(dds)

  png("qc_dispersions.png", 1000, 1000, pointsize = 20)
  plotDispEsts(dds, main = "Dispersion plot")
  dev.off()

  # Regularised-log transformed values, used for the QC plots below.
  rld <- rlog(dds)
  exprMatrix_rlog <- assay(rld)
  write.csv(exprMatrix_rlog, "exprMatrix.rlog.csv")

  # Counts divided by the DESeq2 size factors; this is the same matrix that
  # counts(dds, normalized = TRUE) returns. These are normalised counts, not
  # RPM/TPM values, although the output file name says "rpm".
  normalizedCounts1 <- t(t(counts(dds)) / sizeFactors(dds))
  exprMatrix_rpm <- as.data.frame(normalizedCounts1)
  print(head(exprMatrix_rpm))
  write.csv(exprMatrix_rpm, "exprMatrix.rpm.csv")

  # Raw counts (left column of panels) vs. rlog values (right column).
  png("DEseq_RAWvsNORM.png", height = 800, width = 800)
  par(cex = 0.7)
  n.sample <- ncol(exprSet)
  if (n.sample > 40) par(cex = 0.5)
  cols <- rainbow(n.sample * 1.2)
  par(mfrow = c(2, 2))
  boxplot(exprSet, col = cols, main = "expression value", las = 2)
  boxplot(exprMatrix_rlog, col = cols, main = "expression value", las = 2)
  hist(as.matrix(exprSet))
  hist(exprMatrix_rlog)
  dev.off()

  library(RColorBrewer)
  # One colour per group, named by group label. The names are what make the
  # character lookup below work: indexing an unnamed colour vector with
  # group labels returns NA for every sample.
  group_levels <- unique(group_list)
  mycols <- brewer.pal(8, "Dark2")[seq_along(group_levels)]
  names(mycols) <- group_levels
  print(mycols)
  sample_cols <- unname(mycols[group_list])

  print(cor(as.matrix(exprSet)))

  # Sample distance heatmap
  sampleDists <- as.matrix(dist(t(exprMatrix_rlog)))
  #install.packages("gplots",repos = "http://cran.us.r-project.org")
  library(gplots)
  png("qc-heatmap-samples.png", width = 1000, height = 1000, pointsize = 20)
  heatmap.2(sampleDists, key = FALSE, trace = "none",
            col = colorpanel(100, "black", "white"),
            ColSideColors = sample_cols, RowSideColors = sample_cols,
            margins = c(10, 10), main = "Sample Distance Matrix")
  dev.off()

  print(cor(exprMatrix_rlog))

  # treat_2 vs control, ordered by adjusted p-value.
  res <- results(dds, contrast = c("group_list", "treat_2", "control"))
  resOrdered <- res[order(res$padj), ]
  print(head(resOrdered))
  DEG_treat_2 <- as.data.frame(resOrdered)
  write.csv(DEG_treat_2, "DEG_treat_2_deseq2.results.csv")

  # treat_12 vs control, ordered by adjusted p-value.
  res <- results(dds, contrast = c("group_list", "treat_12", "control"))
  resOrdered <- res[order(res$padj), ]
  print(head(resOrdered))
  DEG_treat_12 <- as.data.frame(resOrdered)
  write.csv(DEG_treat_12, "DEG_treat_12_deseq2.results.csv")

}

######################################################################
# ---- edgeR ----
# Fits a GLM per gene with edgeR and writes a likelihood-ratio test table
# for each treatment-vs-control comparison.
# Expects `exprSet` (genes x samples counts) and `group_list` (one label per
# column of exprSet) to exist already.
######################################################################
if (TRUE) {

  library(edgeR)
  d <- DGEList(counts = exprSet, group = factor(group_list))
  d$samples$lib.size <- colSums(d$counts)
  d <- calcNormFactors(d)
  print(d$samples)

  ## The calcNormFactors function normalizes for RNA composition by finding a set of scaling
  ## factors for the library sizes that minimize the log-fold changes between the samples for most
  ## genes. The default method for computing these scale factors uses a trimmed mean of M-values
  ## (TMM) between each pair of samples.

  png("edgeR_MDS.png")
  plotMDS(d, method = "bcv", col = as.numeric(d$samples$group))
  dev.off()

  # The glm approach to multiple groups is similar to the classic approach,
  # but permits more general comparisons to be made.
  dge <- d

  # One design column per group, no intercept.
  design <- model.matrix(~ 0 + factor(group_list))
  rownames(design) <- colnames(dge)
  colnames(design) <- levels(factor(group_list))

  dge <- estimateGLMCommonDisp(dge, design)
  dge <- estimateGLMTrendedDisp(dge, design)
  dge <- estimateGLMTagwiseDisp(dge, design)

  fit <- glmFit(dge, design)

  # Build the contrasts from the design column names rather than from
  # positional vectors such as c(-1, 1, 0), which silently change meaning if
  # the group labels (and therefore the alphabetical column order) change.
  # makeContrasts() comes from limma, which library(edgeR) attaches.
  contrast_matrix <- makeContrasts(
    contrasts = c("treat_12-control", "treat_2-control"),
    levels = design
  )

  # treat_12 vs control
  lrt <- glmLRT(fit, contrast = contrast_matrix[, "treat_12-control"])
  nrDEG <- topTags(lrt, n = nrow(exprSet))
  nrDEG <- as.data.frame(nrDEG)
  print(head(nrDEG))
  write.csv(nrDEG, "DEG_treat_12_edgeR.csv", quote = FALSE)

  # treat_2 vs control
  lrt <- glmLRT(fit, contrast = contrast_matrix[, "treat_2-control"])
  nrDEG <- topTags(lrt, n = nrow(exprSet))
  nrDEG <- as.data.frame(nrDEG)
  print(head(nrDEG))
  write.csv(nrDEG, "DEG_treat_2_edgeR.csv", quote = FALSE)

  # Summary and mean-difference plot for the last test (treat_2 vs control);
  # no file device is open here, so the plot goes to the active device.
  print(summary(decideTests(lrt)))
  plotMD(lrt)
  abline(h = c(-1, 1), col = "blue")
}

######################################################################
# ---- limma / voom ----
# Runs voom on the filtered counts, fits a linear model per gene, writes one
# table per treatment-vs-control contrast and a raw-vs-normalised QC figure.
# Expects `exprSet` (genes x samples counts) and `group_list` (one label per
# column of exprSet) to exist already.
######################################################################

suppressMessages(library(limma))

# One design column per group, no intercept.
design <- model.matrix(~ 0 + factor(group_list))
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(exprSet)

dge <- DGEList(counts = exprSet)
dge <- calcNormFactors(dge)
# Log-CPM matrix kept for interactive inspection; not used further below.
logCPM <- cpm(dge, log = TRUE, prior.count = 3)

# Argument names are spelled out in full (`normalize.method`, `number`)
# rather than relying on partial argument matching.
# plot = TRUE draws the mean-variance trend on the active graphics device.
v <- voom(dge, design, plot = TRUE, normalize.method = "quantile")
fit <- lmFit(v, design)

group_list
cont.matrix <- makeContrasts(
  contrasts = c("treat_12-control", "treat_2-control"),
  levels = design
)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

# treat_12 vs control: every gene, rows containing NA removed.
tempOutput <- topTable(fit2, coef = "treat_12-control", number = Inf)
DEG_treat_12_limma_voom <- na.omit(tempOutput)
write.csv(DEG_treat_12_limma_voom, "DEG_treat_12_limma_voom.csv", quote = FALSE)

# treat_2 vs control: every gene, rows containing NA removed.
tempOutput <- topTable(fit2, coef = "treat_2-control", number = Inf)
DEG_treat_2_limma_voom <- na.omit(tempOutput)
write.csv(DEG_treat_2_limma_voom, "DEG_treat_2_limma_voom.csv", quote = FALSE)

# Raw counts (left column of panels) vs. voom expression values (right).
png("limma_voom_RAWvsNORM.png", height = 600, width = 600)
exprSet_new <- v$E
par(cex = 0.7)
n.sample <- ncol(exprSet)
if (n.sample > 40) par(cex = 0.5)
cols <- rainbow(n.sample * 1.2)
par(mfrow = c(2, 2))
boxplot(exprSet, col = cols, main = "expression value", las = 2)
boxplot(exprSet_new, col = cols, main = "expression value", las = 2)
hist(as.matrix(exprSet))
hist(exprSet_new)
dev.off()

README.md

+12-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
We can use these scripts to do enrichment analysis for organisms which are not supported by a Bioconductor package.
2121
One of them uses the GOstats package to do the enrichment; the other uses a function written by me.
2222
##7-enrichment-with-newest-kegg/
23-
most of the kegg database are to old , for example:org.Hs.egPATH has 5869 entrez genes and 229 pathways
23+
Most of the KEGG annotation databases are too old, for example:
24+
25+
org.Hs.egPATH has 5869 entrez genes and 229 pathways
2426
In August 2015, there were 6901 Entrez genes and 295 pathways.
2527
And now , there are 299 pathways and 6992 genes.
2628
Given the continued growth of the KEGG database, it is essential to keep it updated.
@@ -29,3 +31,12 @@
2931
##8-DEG
3032
there will be a detailed instruction in the folder, please go ahead .
3133
just for differential expression analysis !!!
34+
35+
##9-microarray
36+
37+
38+
##10-RNA-seq-3-groups
39+
there are three groups for this RNA-seq data.
40+
treat_2 vs control , treat_12 vs control.
41+
Using DESeq2, edgeR, and voom (limma).
42+
You can compare these 3 methods.

0 commit comments

Comments
 (0)