diff --git a/GWASsteps-concordance.tex b/GWASsteps-concordance.tex index 91fbe03..2297d62 100644 --- a/GWASsteps-concordance.tex +++ b/GWASsteps-concordance.tex @@ -1,4 +1,5 @@ \Sconcordance{concordance:GWASsteps.tex:GWASsteps.Rnw:% 1 35 1 50 0 4 1 5 0 7 1 6 0 6 1 6 0 7 1 5 0 7 1 5 0 5 1 5 0 6 1 4 % 0 26 1 6 0 7 1 5 0 12 1 5 0 10 1 3 0 4 1 5 0 10 1 5 0 7 1 5 0 56 1 % -6 0 11 1 5 0 10 1 5 0 11 1 5 0 35 1 10 0 18 1 7 0 21 1 8 0 5 1} +6 0 11 1 5 0 10 1 5 0 14 1 5 0 14 1 4 0 12 1 7 0 13 1 5 0 33 1 10 % +0 18 1 7 0 21 1 8 0 5 1} diff --git a/GWASsteps.Rnw b/GWASsteps.Rnw index bddc31f..ef71967 100644 --- a/GWASsteps.Rnw +++ b/GWASsteps.Rnw @@ -203,7 +203,7 @@ GWAS<-function(rsNumber){ print(rsNumber) tempSNP <- data.frame(FamID=row.names(genoNum),snp=genoNum[,rsNumber]) dat <- merge(phenoSub,tempSNP,by.x="FamID",by.y="FamID",all.x=TRUE) - a <- summary(glm(fldl.wk0 ~ age + sex + raceth + pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10 + snp, family = gaussian, data=dat)) + a <- summary(glm(fldl_wk0 ~ age + sex + raceth + pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10 + snp, family = gaussian, data=dat)) out <- as.matrix(a$coefficients['snp',]) out } @@ -232,25 +232,65 @@ fldl.wk0.p3 <- data.frame(t(out)) \subsection{Summarizing and Visualization} -We now have a new dataset that we write to a file that contains rsNumber, Estimate, Standard Error, Zvalues and Pvalues +**Put in Merge stuff + + +We now have a new dataset that contains rsNumbers, Estimates, Standard Errors, Zvalues and Pvalues. <>= -names(fldl.wk0.p3) <- c("Estimate","SE","Zvalue","Pvalue") -fldl.wk0.p3$rsNumber <- rsVec -fldl.wk0.p3 <- as.matrix(fldl.wk0.p3) -write.csv(fldl_wk0_p3,"/home/ramoser/fldl.wk0.p3") +names(fldl_wk0_p3) <- c("Estimate","SE","Zvalue","Pvalue") +fldl_wk0_p3$rsNumber <- rsVec +fldl_wk0_p3 <- as.matrix(fldl_wk0_p3) +write.csv(fldl_wk0_p3,"/home/ramoser/fldl-wk0_p3") @ -Below is a function that will create a Manhattan Plot. A Manhattan plot vizualizes where chromosome numbers are displayed along the $x$-axis an the negative logarithm of the association P-value for each SNP on the Y-axis. This is useful when trying to determine if the association between the SNP and the chromosome is significant.\\ +Our new dataset contains only part of the variables needed to accurately summarize the data. We need to be able to determine which SNP cooresponds to which chromosome, gene and type. To do so we load in a dataset that contains this information for the RS numbers and merge with our existing dataset. -* is chromosome the right word? Or homolog or haplotype? -\\ +<>= +baseline1 <- read.csv("fldl_wk0_p3.txt", header = T) +baseline1 <- baseline1[, -1] + +merge1 <- read.table("2013-08-12-AnnotationSNPsToGenes.txt", sep="\t") +merge1$rsNumber <- merge1$snp +merge1 <- merge1[, c(1:5,7)] + +baseline2 <- merge(baseline1, merge1, by = "rsNumber") + +@ + +In this case, we are only interested in looking at the following gene types: exonic, intronic, splicing, UTR3, UTR5, downstream, exonic;splicing, and upstream. Thus, we keep only these types and sort by P-values. \\ + +*Why only these? Is this standard, should I describe each of these gene types? I feel like there is a bit more ``meat" needed here. + +<>= +#Keeping Gene Types:exonic, intronic, splicing, UTR3, UTR5, downstream, exonic;splicing, upstream -*Before doing manhattan plot, should we put in the merging files information: New RS numbers, Keeping Gene Types:exonic, intronic, splicing, UTR3, UTR5, downstream, exonic;splicing, upstream, and extracting only P-vals $< 5*10^-5$. I'm asking because it is only after this merging that we create a table that will allow for the manhattan plot. Or am I mistaken, are we merly showing the plot as an example. +keep1 <- c("exonic", "intronic", "splicing", "UTR3", "UTR5", "downstream", "exonic;splicing", "upstream") +baseline3 <- baseline2[is.element(el=baseline2$type, set=keep1), ] +baseline3 <- baseline3[order(baseline3$Pvalue),] +@ + +In this example we are interested in P-values $< 5*10^{-5}$. Thus, we create a subset of SNPs with those P-values and summarize them in a table. Furthermore, we drop ``chr" to report just chromosome numbers and save the resulting table. + +<>= +baseTable <- baseline3[baseline3$Pvalue <= 5*10^(-5), ] + +baseline3$chr<-substring(baseline3$chr, 4,5) +baseline3$chr<-as.numeric(as.character(baseline3$chr)) + +write.csv(baseline3,"baseline.txt") +xtable(baseTable) +@ + + +Now that we have a complete dataset with SNPs, chromosome, gene type, etc., we can create a Manhattan Plot to view the data. Below is a function that will create a Manhattan Plot. A Manhattan plot vizualizes where chromosome numbers are displayed along the $x$-axis an the negative logarithm of the association P-value for each SNP on the Y-axis. This is useful when trying to determine if the association between the SNP and the chromosome is significant.\\ + +* is chromosome the right word? Or homolog or haplotype? +*Greg, I can't get this code to actually work. Am I missing something? <>= -one=read.table("baseline_table.RData",header=T,sep="\t") +one=read.csv("baseline.txt", header=T) position=one$Pos/1000000 chr=one$Chr nsnp=as.numeric(table(chr))[1:22] @@ -276,7 +316,7 @@ for(CHR in 2:22){ } @ -Now, we can creat the Manhattan Plot +Now, we can creat the Manhattan Plot. <>= bitmap(file="manhattan.jpeg",type="jpeg",width=10,height=6,res=432,pointsize=8) diff --git a/GWASsteps.log b/GWASsteps.log index 46cb2d9..c65c8be 100644 --- a/GWASsteps.log +++ b/GWASsteps.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.1415926-2.4-1.40.13 (MiKTeX 2.9) (preloaded format=pdflatex 2013.3.13) 24 MAR 2014 18:31 +This is pdfTeX, Version 3.1415926-2.4-1.40.13 (MiKTeX 2.9) (preloaded format=pdflatex 2013.3.13) 25 MAR 2014 20:40 entering extended mode **GWASsteps.tex (D:\Users\Emily\Documents\School\RA\GWASPaper\GWASsteps.tex @@ -394,7 +394,7 @@ UE[][])[][] Overfull \hbox (6.89565pt too wide) in paragraph at lines 328--328 -[] []\OT1/cmtt/m/n/10.95 a[] []<-[] []summary[][]([][]glm[][](fldl.wk0[] []~ +[] []\OT1/cmtt/m/n/10.95 a[] []<-[] []summary[][]([][]glm[][](fldl_wk0[] []~ [] []age[] []+[] []sex[] []+[] []raceth[] []+[] []pc1[] []+[] []pc2[] []+[] []p c3[] []+[] []pc4[] []+[][] [] @@ -409,42 +409,53 @@ Overfull \hbox (35.63908pt too wide) in paragraph at lines 329--329 [6] Package color Info: Redefining color shadecolor on input line 346. Package color Info: Redefining color shadecolor on input line 357. -Package color Info: Redefining color shadecolor on input line 374. +Package color Info: Redefining color shadecolor on input line 377. +Package color Info: Redefining color shadecolor on input line 391. + [7] +Underfull \hbox (badness 10000) in paragraph at lines 406--407 -Underfull \hbox (badness 10000) in paragraph at lines 385--386 + [] + +Package color Info: Redefining color shadecolor on input line 411. +Overfull \hbox (104.62334pt too wide) in paragraph at lines 416--416 +[][]\OT1/cmtt/m/n/10.95 keep1[] []<-[] []c[][]([][]"exonic"[][],[] []"intronic" +[][],[] []"splicing"[][],[] []"UTR3"[][],[] []"UTR5"[][],[] []"downstream"[][], +[] []"exonic;splicing"[][],[][] [] -[7] -Underfull \hbox (badness 10000) in paragraph at lines 387--389 +Package color Info: Redefining color shadecolor on input line 428. + +Underfull \hbox (badness 10000) in paragraph at lines 444--445 [] -Package color Info: Redefining color shadecolor on input line 394. +[8] +Package color Info: Redefining color shadecolor on input line 450. -Overfull \hbox (6.89565pt too wide) in paragraph at lines 413--413 +Overfull \hbox (6.89565pt too wide) in paragraph at lines 469--469 [] []\OT1/cmtt/m/n/10.95 POS[(total.prev[] []+[] []1[][])[][]:[][](total.pre v[] []+[] []nsnp[CHR])][] []=[] []start[] []+[] []position[chr[] []==[][] [] -Package color Info: Redefining color shadecolor on input line 433. -[8] -Overfull \hbox (35.63908pt too wide) in paragraph at lines 435--435 +Package color Info: Redefining color shadecolor on input line 489. +[9] +Overfull \hbox (35.63908pt too wide) in paragraph at lines 491--491 [][]\OT1/cmtt/m/n/10.95 bitmap[][]([][]file[] []=[] []"manhattan.jpeg"[][],[] [ ]type[] []=[] []"jpeg"[][],[] []width[] []=[] []10[][],[] []height[] []=[] []6[ ][],[] []res[] []=[] []432[][],[][] [] -Overfull \hbox (35.63908pt too wide) in paragraph at lines 438--438 +Overfull \hbox (35.63908pt too wide) in paragraph at lines 494--494 [][]\OT1/cmtt/m/n/10.95 plot[][](POS,[] []-[][]log10[][](PVAL),[] []pch[] []=[] []20[][],[] []col[] []= COL,[] []xlab[] []=[] []""[][],[] []ylab[] []=[] []"-l og10(p-value)"[][],[][] [] -[9] -Package color Info: Redefining color shadecolor on input line 473. - [10] (D:\Users\Emily\Documents\School\RA\GWASPaper\GWASsteps.aux) ) +[10] +Package color Info: Redefining color shadecolor on input line 529. + [11] (D:\Users\Emily\Documents\School\RA\GWASPaper\GWASsteps.aux) ) Here is how much of TeX's memory you used: 2942 strings out of 493921 38374 string characters out of 3144876 @@ -452,7 +463,7 @@ Here is how much of TeX's memory you used: 6187 multiletter control sequences out of 15000+200000 11064 words of font info for 42 fonts, out of 3000000 for 9000 841 hyphenation exceptions out of 8191 - 27i,12n,43p,1007b,283s stack positions out of 5000i,500n,10000p,200000b,50000s + 27i,11n,43p,1007b,283s stack positions out of 5000i,500n,10000p,200000b,50000s -Output written on GWASsteps.pdf (10 pages, 188484 bytes). +Output written on GWASsteps.pdf (11 pages, 192365 bytes). PDF statistics: - 87 PDF objects out of 1000 (max. 8388607) + 90 PDF objects out of 1000 (max. 8388607) 0 named destinations out of 1000 (max. 500000) 1 words of extra memory for PDF output out of 10000 (max. 10000000) diff --git a/GWASsteps.pdf b/GWASsteps.pdf index bf5f81a..3f52878 100644 Binary files a/GWASsteps.pdf and b/GWASsteps.pdf differ diff --git a/GWASsteps.synctex.gz b/GWASsteps.synctex.gz index d32493a..e5cb2eb 100644 Binary files a/GWASsteps.synctex.gz and b/GWASsteps.synctex.gz differ diff --git a/GWASsteps.tex b/GWASsteps.tex index effd0f2..44cbc23 100644 --- a/GWASsteps.tex +++ b/GWASsteps.tex @@ -325,7 +325,7 @@ \subsection{Fitting the Model} \hlkwd{print}\hlstd{(rsNumber)} \hlstd{tempSNP} \hlkwb{<-} \hlkwd{data.frame}\hlstd{(}\hlkwc{FamID} \hlstd{=} \hlkwd{row.names}\hlstd{(genoNum),} \hlkwc{snp} \hlstd{= genoNum[, rsNumber])} \hlstd{dat} \hlkwb{<-} \hlkwd{merge}\hlstd{(phenoSub, tempSNP,} \hlkwc{by.x} \hlstd{=} \hlstr{"FamID"}\hlstd{,} \hlkwc{by.y} \hlstd{=} \hlstr{"FamID"}\hlstd{,} \hlkwc{all.x} \hlstd{=} \hlnum{TRUE}\hlstd{)} - \hlstd{a} \hlkwb{<-} \hlkwd{summary}\hlstd{(}\hlkwd{glm}\hlstd{(fldl.wk0} \hlopt{~} \hlstd{age} \hlopt{+} \hlstd{sex} \hlopt{+} \hlstd{raceth} \hlopt{+} \hlstd{pc1} \hlopt{+} \hlstd{pc2} \hlopt{+} \hlstd{pc3} \hlopt{+} \hlstd{pc4} \hlopt{+} + \hlstd{a} \hlkwb{<-} \hlkwd{summary}\hlstd{(}\hlkwd{glm}\hlstd{(fldl_wk0} \hlopt{~} \hlstd{age} \hlopt{+} \hlstd{sex} \hlopt{+} \hlstd{raceth} \hlopt{+} \hlstd{pc1} \hlopt{+} \hlstd{pc2} \hlopt{+} \hlstd{pc3} \hlopt{+} \hlstd{pc4} \hlopt{+} \hlstd{pc5} \hlopt{+} \hlstd{pc6} \hlopt{+} \hlstd{pc7} \hlopt{+} \hlstd{pc8} \hlopt{+} \hlstd{pc9} \hlopt{+} \hlstd{pc10} \hlopt{+} \hlstd{snp,} \hlkwc{family} \hlstd{= gaussian,} \hlkwc{data} \hlstd{= dat))} \hlstd{out} \hlkwb{<-} \hlkwd{as.matrix}\hlstd{(a}\hlopt{$}\hlstd{coefficients[}\hlstr{"snp"}\hlstd{, ])} \hlstd{out} @@ -368,32 +368,88 @@ \subsubsection{Running a Parallel Analysis} \subsection{Summarizing and Visualization} -We now have a new dataset that we write to a file that contains rsNumber, Estimate, Standard Error, Zvalues and Pvalues +**Put in Merge stuff + + +We now have a new dataset that contains rsNumbers, Estimates, Standard Errors, Zvalues and Pvalues. \begin{knitrout} \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} \begin{alltt} -\hlkwd{names}\hlstd{(fldl.wk0.p3)} \hlkwb{<-} \hlkwd{c}\hlstd{(}\hlstr{"Estimate"}\hlstd{,} \hlstr{"SE"}\hlstd{,} \hlstr{"Zvalue"}\hlstd{,} \hlstr{"Pvalue"}\hlstd{)} -\hlstd{fldl.wk0.p3}\hlopt{$}\hlstd{rsNumber} \hlkwb{<-} \hlstd{rsVec} -\hlstd{fldl.wk0.p3} \hlkwb{<-} \hlkwd{as.matrix}\hlstd{(fldl.wk0.p3)} -\hlkwd{write.csv}\hlstd{(fldl_wk0_p3,} \hlstr{"/home/ramoser/fldl.wk0.p3"}\hlstd{)} +\hlkwd{names}\hlstd{(fldl_wk0_p3)} \hlkwb{<-} \hlkwd{c}\hlstd{(}\hlstr{"Estimate"}\hlstd{,} \hlstr{"SE"}\hlstd{,} \hlstr{"Zvalue"}\hlstd{,} \hlstr{"Pvalue"}\hlstd{)} +\hlstd{fldl_wk0_p3}\hlopt{$}\hlstd{rsNumber} \hlkwb{<-} \hlstd{rsVec} +\hlstd{fldl_wk0_p3} \hlkwb{<-} \hlkwd{as.matrix}\hlstd{(fldl_wk0_p3)} +\hlkwd{write.csv}\hlstd{(fldl_wk0_p3,} \hlstr{"/home/ramoser/fldl-wk0_p3"}\hlstd{)} \end{alltt} \end{kframe} \end{knitrout} -Below is a function that will create a Manhattan Plot. A Manhattan plot vizualizes where chromosome numbers are displayed along the $x$-axis an the negative logarithm of the association P-value for each SNP on the Y-axis. This is useful when trying to determine if the association between the SNP and the chromosome is significant.\\ +Our new dataset contains only part of the variables needed to accurately summarize the data. We need to be able to determine which SNP cooresponds to which chromosome, gene and type. To do so we load in a dataset that contains this information for the RS numbers and merge with our existing dataset. -* is chromosome the right word? Or homolog or haplotype? -\\ +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlstd{baseline1} \hlkwb{<-} \hlkwd{read.csv}\hlstd{(}\hlstr{"fldl_wk0_p3.txt"}\hlstd{,} \hlkwc{header} \hlstd{= T)} +\hlstd{baseline1} \hlkwb{<-} \hlstd{baseline1[,} \hlopt{-}\hlnum{1}\hlstd{]} + +\hlstd{merge1} \hlkwb{<-} \hlkwd{read.table}\hlstd{(}\hlstr{"2013-08-12-AnnotationSNPsToGenes.txt"}\hlstd{,} \hlkwc{sep} \hlstd{=} \hlstr{"\textbackslash{}t"}\hlstd{)} +\hlstd{merge1}\hlopt{$}\hlstd{rsNumber} \hlkwb{<-} \hlstd{merge1}\hlopt{$}\hlstd{snp} +\hlstd{merge1} \hlkwb{<-} \hlstd{merge1[,} \hlkwd{c}\hlstd{(}\hlnum{1}\hlopt{:}\hlnum{5}\hlstd{,} \hlnum{7}\hlstd{)]} -*Before doing manhattan plot, should we put in the merging files information: New RS numbers, Keeping Gene Types:exonic, intronic, splicing, UTR3, UTR5, downstream, exonic;splicing, upstream, and extracting only P-vals $< 5*10^-5$. I'm asking because it is only after this merging that we create a table that will allow for the manhattan plot. Or am I mistaken, are we merly showing the plot as an example. +\hlstd{baseline2} \hlkwb{<-} \hlkwd{merge}\hlstd{(baseline1, merge1,} \hlkwc{by} \hlstd{=} \hlstr{"rsNumber"}\hlstd{)} +\end{alltt} +\end{kframe} +\end{knitrout} +In this case, we are only interested in looking at the following gene types: exonic, intronic, splicing, UTR3, UTR5, downstream, exonic;splicing, and upstream. Thus, we keep only these types and sort by P-values. \\ + +*Why only these? Is this standard, should I describe each of these gene types? I feel like there is a bit more ``meat" needed here. + +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlcom{# Keeping Gene Types:exonic, intronic, splicing, UTR3, UTR5, downstream,} +\hlcom{# exonic;splicing, upstream} + +\hlstd{keep1} \hlkwb{<-} \hlkwd{c}\hlstd{(}\hlstr{"exonic"}\hlstd{,} \hlstr{"intronic"}\hlstd{,} \hlstr{"splicing"}\hlstd{,} \hlstr{"UTR3"}\hlstd{,} \hlstr{"UTR5"}\hlstd{,} \hlstr{"downstream"}\hlstd{,} \hlstr{"exonic;splicing"}\hlstd{,} + \hlstr{"upstream"}\hlstd{)} +\hlstd{baseline3} \hlkwb{<-} \hlstd{baseline2[}\hlkwd{is.element}\hlstd{(}\hlkwc{el} \hlstd{= baseline2}\hlopt{$}\hlstd{type,} \hlkwc{set} \hlstd{= keep1), ]} +\hlstd{baseline3} \hlkwb{<-} \hlstd{baseline3[}\hlkwd{order}\hlstd{(baseline3}\hlopt{$}\hlstd{Pvalue), ]} +\end{alltt} +\end{kframe} +\end{knitrout} + + +In this example we are interested in P-values $< 5*10^{-5}$. Thus, we create a subset of SNPs with those P-values and summarize them in a table. Furthermore, we drop ``chr" to report just chromosome numbers and save the resulting table. + +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlstd{baseTable} \hlkwb{<-} \hlstd{baseline3[baseline3}\hlopt{$}\hlstd{Pvalue} \hlopt{<=} \hlnum{5} \hlopt{*} \hlnum{10}\hlopt{^}\hlstd{(}\hlopt{-}\hlnum{5}\hlstd{), ]} + +\hlstd{baseline3}\hlopt{$}\hlstd{chr} \hlkwb{<-} \hlkwd{substring}\hlstd{(baseline3}\hlopt{$}\hlstd{chr,} \hlnum{4}\hlstd{,} \hlnum{5}\hlstd{)} +\hlstd{baseline3}\hlopt{$}\hlstd{chr} \hlkwb{<-} \hlkwd{as.numeric}\hlstd{(}\hlkwd{as.character}\hlstd{(baseline3}\hlopt{$}\hlstd{chr))} + +\hlkwd{write.csv}\hlstd{(baseline3,} \hlstr{"baseline.txt"}\hlstd{)} + +\hlkwd{xtable}\hlstd{(baseTable)} +\end{alltt} +\end{kframe} +\end{knitrout} + + + +Now that we have a complete dataset with SNPs, chromosome, gene type, etc., we can create a Manhattan Plot to view the data. Below is a function that will create a Manhattan Plot. A Manhattan plot vizualizes where chromosome numbers are displayed along the $x$-axis an the negative logarithm of the association P-value for each SNP on the Y-axis. This is useful when trying to determine if the association between the SNP and the chromosome is significant.\\ + +* is chromosome the right word? Or homolog or haplotype? +*Greg, I can't get this code to actually work. Am I missing something? + \begin{knitrout} \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} \begin{alltt} -\hlstd{one} \hlkwb{=} \hlkwd{read.table}\hlstd{(}\hlstr{"baseline_table.RData"}\hlstd{,} \hlkwc{header} \hlstd{= T,} \hlkwc{sep} \hlstd{=} \hlstr{"\textbackslash{}t"}\hlstd{)} +\hlstd{one} \hlkwb{=} \hlkwd{read.csv}\hlstd{(}\hlstr{"baseline.txt"}\hlstd{,} \hlkwc{header} \hlstd{= T)} \hlstd{position} \hlkwb{=} \hlstd{one}\hlopt{$}\hlstd{Pos}\hlopt{/}\hlnum{1e+06} \hlstd{chr} \hlkwb{=} \hlstd{one}\hlopt{$}\hlstd{Chr} \hlstd{nsnp} \hlkwb{=} \hlkwd{as.numeric}\hlstd{(}\hlkwd{table}\hlstd{(chr))[}\hlnum{1}\hlopt{:}\hlnum{22}\hlstd{]} @@ -427,7 +483,7 @@ \subsection{Summarizing and Visualization} \end{knitrout} -Now, we can creat the Manhattan Plot +Now, we can creat the Manhattan Plot. \begin{knitrout} \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}