forked from kplaney/curatedBreastCancer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGSE20181.Rout
153 lines (134 loc) · 5.35 KB
/
GSE20181.Rout
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
R version 2.15.0 (2012-03-30)
Copyright (C) 2012 The R Foundation for Statistical Computing
ISBN 3-900051-07-0
Platform: x86_64-unknown-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
[Previously saved workspace restored]
> #!/usr/bin/env Rscript
>
> #require(utils)
> #print(R.Version())
> #####modified ArrayExpress example from Purvesh
>
>
> #clean up data space
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 716287 38.3 899071 48.1 818163 43.7
Vcells 98824365 754.0 145002199 1106.3 99194798 756.8
> rm(list=ls())
>
> #an Array Express dataset that I need to parse out files, so use Affy
> require("Affy")
Loading required package: Affy
Warning message:
In library(package, lib.loc = lib.loc, character.only = TRUE, logical.return = TRUE, :
there is no package called ‘Affy’
> require(gcrma)
Loading required package: gcrma
Loading required package: affy
Loading required package: BiocGenerics
Attaching package: ‘BiocGenerics’
The following object(s) are masked from ‘package:stats’:
xtabs
The following object(s) are masked from ‘package:base’:
anyDuplicated, cbind, colnames, duplicated, eval, Filter, Find,
get, intersect, lapply, Map, mapply, mget, order, paste, pmax,
pmax.int, pmin, pmin.int, Position, rbind, Reduce, rep.int,
rownames, sapply, setdiff, table, tapply, union, unique
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: BiocInstaller
BiocInstaller version 1.4.7, ?biocLite for help
>
> array = "gpl_96"
>
> filePath = ("~/expr_breastCancerDatasets/GSE20181/prePatients")
> fileNames = list.files(filePath)
>
> setwd("~/expr_breastCancerDatasets/GSE20181/prePatients")
>
> rawExpressionSet = ReadAffy(filenames = fileNames)
>
> setwd("~/expr_breastCancerDatasets/GSE20181")
>
> Pheno = read.delim(header = TRUE, file = "GSE20181_clinical_preTreat.txt")
> #background correct w/ rma, norm with quantiles using gcrma
> dataNorm = gcrma(rawExpressionSet)
Loading required package: AnnotationDbi
Adjusting for optical effect..........................................................Done.
Computing affinities.Done.
Adjusting for non-specific binding..........................................................Done.
Normalizing
Calculating Expression
> source(file = "~/global_R_code/db_functions.r")
> con = connect.db("ywrfc09", "aveelyau05", host="buttelab-db1", dbname="annot_gpl")
Loading required package: DBI
Loading required package: RMySQL
> dataMapping = mapProbesToGenes.db(con, rownames(exprs(dataNorm)), map.to="Symbol", gpl.table=array)
> dbDisconnect(con)
[1] TRUE
> require("limma")
Loading required package: limma
> GSM = strsplit2(colnames(exprs(dataNorm)),".CEL")
> GSM = GSM[,1]
> GSM = substring(GSM,4)
> colnames(exprs(dataNorm)) = GSM
> GSE20181_GPL96 = list(expr=exprs(dataNorm), Pheno = Pheno, GSMID = GSM,ReporterID = rownames(exprs(dataNorm)),GeneSymbol=dataMapping[rownames(exprs(dataNorm)),"gene"])
> boxplot(GSE20181_GPL96$expr, las=3, main="GSE20181_GPL96")
> #manual check on length of keys, expr needs to have this many rows
> length(GSE20181_GPL96$GeneSymbol)
[1] 22283
> #manual check on number of patients I have, expr needs to have this many cols
> length(GSE20181_GPL96$GSMID)
[1] 58
> #manual check that expr is data frame with genes as rows and sample values as cols
> dim(GSE20181_GPL96$expr)
[1] 22283 58
>
> print(GSE20181_GPL96$GSMID[1:10])
[1] "125123" "125125" "125127" "125129" "125131" "125133" "125135" "125137"
[9] "125139" "125141"
>
> #look at pheno - which column is GSMIDs?
> attributes(Pheno)
$names
[1] "GSMID" "GSMID_short"
[3] "orig_studyID" "personID"
[5] "preTreat." "responder."
[7] "treatment_timePoint_Days" "X.Sample_molecule_ch1"
[9] "biopsy_protocol" "hyb_protocol"
[11] "scan_protocol" "analysis"
[13] "array"
$class
[1] "data.frame"
$row.names
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
[51] 51 52 53 54 55 56 57 58
>
> print(GSE20181_GPL96$GeneSymbol[1:20])
1007_s_at 1053_at 117_at 121_at 1255_g_at 1294_at 1316_at 1320_at
"DDR1" "RFC2" "HSPA6" "PAX8" "GUCA1A" "UBA7" "THRA" "PTPN21"
1405_i_at 1431_at 1438_at 1487_at 1494_f_at 1598_g_at 160020_at 1729_at
"CCL5" "CYP2E1" "EPHB3" "ESRRA" "CYP2A6" "GAS6" "MMP14" "TRADD"
1773_at 177_at 179_at 1861_at
"FNTB" "PLD1" "PMS2P11" "BAD"
>
> save(GSE20181_GPL96, file="GSE20181_GPL96.RData")
>
> proc.time()
user system elapsed
188.569 20.518 219.240