-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_clientcohort_pages.py
executable file
·336 lines (271 loc) · 17.1 KB
/
make_clientcohort_pages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#!/usr/bin/env python
#
# this script creates a client oriented html document for each cohort of
# a gbs run (or for a specified cohort foilder) , which presents the various plots generated , and
# has links to text output. Its also calls utilities to make a pdf and
# marshall the output to be sent.
#
import os
import re
import itertools
import string
import exceptions
import argparse
header1="""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"httpd://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>
Results for %(title)s
</title>
</head>
<body>
<h1> Results for %(title)s </h1>
<ul>
</p>
"""
footer1="""
</body>
</html>
"""
def get_cohorts(options):
BASEDIR=options["basedir"]
# cohorts are idenitified as subfolders of the run folder that
# * are not tardis working folders (i.e. have names starting with tardis
# * are of like SQ0775.all.TILAPIA.PstI-MspI
# - i.e. library.qc-cohort.gbs-cohort.enzyme
run_folder=os.path.join(BASEDIR, options["run_folder"])
#print "DEBUG : "+run_folder
#SQ0810.all.PstI-MspI.PstI-MspI
#SQ0812.all.ApeKI.ApeKI
#SQ2768.all.ApeKI.ApeKI
#SQ2769.all.ApeKI.ApeKI
#SQ2770.all.ApeKI.ApeKI
cohort_folders=[ node for node in os.listdir(run_folder) if re.search("^tardis", node) is None and \
re.search("^\S+\.\S+\.\S+\.\S+$", node) is not None and os.path.isdir(os.path.join(run_folder, node)) and
re.search("^OLD_", node) is None]
return cohort_folders
def generate_run_plot(options):
BASEDIR=options["basedir"] # e.g. /bifo/scratch/2023_illumina_sequencing_c/postprocessing/gbs
stats = {
"found file count" : 0,
"no file count" : 0,
"no sample count" : 0
}
file_group_iter = ( ("KGD (plots)", "image"), ("KGD (text file links)", "link"), ("Hapmap files", "link" ) )
file_iters = {
"KGD (plots)" : ['AlleleFreq.png', 'CallRate.png', 'Co-call-HWdgm.05.png', 'Co-call-.png', 'finplot.png', \
'GcompareHWdgm.05.png', 'Gcompare.png', 'Gdiagdepth.png', 'GHWdgm.05diagdepth.png', \
'InbCompare.png','Heatmap-G5HWdgm.05.png', 'HWdisMAFsig.png', \
'MAFHWdgm.05.png', 'MAF.png', 'PC1v2G5HWdgm.05.png', 'SampDepthCR.png', 'SampDepthHist.png', \
'SampDepth.png', 'SampDepth-scored.png', 'SNPCallRate.png', 'SNPDepthHist.png', 'SNPDepth.png', \
'X2star-QQ.png', 'PlateDepth.png', 'PlateInb.png', 'SubplateDepth.png', 'SubplateInb.png'],
"KGD (text file links)" : ['GHW05.csv', 'GHW05-Inbreeding.csv', 'GHW05-long.csv', 'GHW05-pca_metadata.tsv', 'GHW05-pca_vectors.tsv', 'GHW05-PC.csv', 'GHW05.RData', 'GHW05.vcf', 'HeatmapOrderHWdgm.05.csv', 'HeatmapOrderHWdgm.05.csv.blinded', 'PCG5HWdgm.05.pdf', 'SampleStats.csv', 'SampleStats.csv.blinded', 'seqID.csv', 'seqID.csv.blinded'],
"Hapmap files" : ['HapMap.hmc.txt','HapMap.fas.txt']
}
file_iters["KGD (plots)"] = [ os.path.join(options["kgd_subfolder_name"], item) for item in file_iters["KGD (plots)"] ]
file_iters["KGD (text file links)"] = [ os.path.join(options["kgd_subfolder_name"], item) for item in file_iters["KGD (text file links)"] ]
file_iters["Hapmap files"] = [ os.path.join(options["hapmap_subfolder_name"], item) for item in file_iters["Hapmap files"] ]
narratives = {
'InbCompare.png' : """
Check clumpify normalisation (deduplication) by comparing four estimates of inbreeding: a) production estimate
b) adjust using fitted beta-binomial (bb) c) sampling one read per lane to remove any duplication effect (used to fit bb)
d) treat two lanes as two individuals. If the points are below the line in the first column (a .v. b,c,d), or alpha
is low, our normalisation may not be aggressive enough, as production inbreeding estimate is thus higher than these other
three methods (each of which should provide a robust estimate of inbreeding, even with un-normalised data)
""",
'AlleleFreq.png' : """
Comparisons SNP reference allele frequencies calculated 3 different ways: 1) after naively converting to genotypes, 2) based on allele counts (without any adjustment for multiple counts of the same allele), 3) as given by UNEAK (the same as method 2).
""",
'CallRate.png' : """
Histogram of call rate (proportion of SNPs scored) for each sample.
""",
'Co-call-HWdgm.05.png' : """
Shows the co-call distribution (see Co-call-.png) after applying the Hardy-Weinberg filter to SNPs and combining lanes (if relevant).
""",
'Co-call-.png' : """
The co-call plot shows the distribution (over all possible pairs) of the proportion of SNPs called in a pair of individuals.
Bimodality may indicate that the samples belong to 2+ genetically divergent clusters (low co-call rate for pairs from different clusters)
""",
'finplot.png' : """
A finplot shows raw Hardy-Weinberg disequilibrium (PAA - pA^2) against MAF for each SNP.
SNPs near the upper border tend to be mainly homozygous. This is often because they have low depth (perhaps only one allele observed).
SNPs near the lower border tend to be mainly heterozygous and may often represent duplicated regions erroneously treated as one region.
The Hardy-Weinberg filter, HWdgm.05 (Hardy-Weinberg disequilibium > -0.05) used in some analyses removes these heterozygous SNPs.
""",
'Gcompare.png' : """
Comparisons of different methods for calculating off-diagonals of the GRM. G5=KGD method, G3=KGD with single allele per genotype, G1=unadjusted (biased towards 0).
""",
'GcompareHWdgm.05.png' : """
Same as Gcompare.png plot but using only those SNPs passing the Hardy-Weinberg filter and combining lanes (if relevant)
""",
'SampDepthHist.png' : """
Distribution of sample depths. If it is not roughly unimodal, there may be contamination or a mix of different species or breeds. Also, some downstream
pipelines use a sample-depth cutoff to filter the data, which could be confused by a multi-modal sample-depth distribution.
""",
'Gdiagdepth.png' : """
In theory self-relatedness estimated from GBS data should be uncorrelated with sample depth, so there should be neither negative nor positive trend
in this plot. A negatively sloping trend has been nick-named a slippery slope, and means something is not quite right somewhere (with either the make-up of the original sample
-e.g. some kind of contamination, or the enzyme digest, or size selection, or sequencing, or downstream processing). If they persist, slippery slopes need to be investigated and
resolved
""",
'GHWdgm.05diagdepth.png' : """
This is similar to the Gdiagdepth.png plot, but 1) combining lanes (if relevant) and 2) based on a smaller set of SNPs (net of the Hardy-Weinberg filter), which eliminates some potential sources of
correlation between self-relatedness estimates and sample depth. So this plot should exhibit less than (or the same) correlation as that plot.
""",
'Heatmap-G5HWdgm.05.png' : """
Heatmap representation of the G5 GRM.
Relatedness values are coloured from white to red (lowest to highest relatedness). Samples are ordered by a dendrogram from hierarchically clustering a distance matrix of the GRM.
SNPs have been Hardy-Weinberg filterd and and lanes combined (if relevant)
""",
'HWdisMAFsig.png' : """
SNPs plotted as in finplot.png but coloured by an approximate depth-adjusted significance value.
""",
'MAF.png' : """
MAF distribution. Diverse populations tend to have a peak at low values of MAF.
""",
'MAFHWdgm.05.png' : """
MAF distribution of Hardy-Weinberg filtered SNPs after combining lanes (if relevant)
""",
'PC1v2G5HWdgm.05.png' : """
Principal components plot (1st 2 coordinates) of samples, based on the G5 GRM.
SNPs have been Hardy-Weinberg filterd and and lanes combined (if relevant)
""",
'SampDepthCR.png' : """
Plot of depth vs call rate for samples. This is usually a fairly tight upwardly curving line.
""",
'SampDepthHist.png' : """
Histogram of sample depths. This is usually unimodal.
""",
'SampDepth.png' : """
Plot of mean vs median sample depth.
""",
'SampDepth-scored.png' : """
Plot of mean sample depth for SNPs with at least one read for the sample against the mean sample depth including all SNPs.
""",
'SNPCallRate.png' : """
Histogram of the proportion of samples called for a SNP. There is usually a peak at a high value and tails off to the left.
""",
'SNPDepthHist.png' : """
Histogram of mean SNP depth. There are often a few SNPs with extremely high depth, while most SNPs have relatively low depth.
The SNPdepth.png plot has SNP depth log-transformed to show the distribution more clearly.
""",
'SNPDepth.png' : """
Plot of SNP call rate against (log-transformed) SNP depth. Usually a S-shaped band.
High depth low call rate SNPs may indicate diverse populations (some SNPs only seen in a subset) (and/or possibly size selection variation?)
""",
'X2star-QQ.png' : """
Quantile-quantile plot of approximate depth-adjusted Hardy-Weinberg test statistic for each SNP. A 1-1 line indicates the theoretical distribution holds.
A higher sloped straight line suggests some population structure. SNPs that plot higher than a straight line through the majority of SNPs may indicate SNPs that do not behave in a Mendelian fashion.
""",
'PlateDepth.png' : """
Plot of mean sample depth by plate position. Patterns of depth variation may represent problems with sample handling.
""",
'PlateInb.png' : """
Plot of estimated inbreeding by plate position. Patterns of inbreeding variation may represent problems with sample handling.
""",
'SubplateDepth.png' : """
The same as PlateDepth.png but with a different colour gradient for each subplate within the main plate.
""",
'SubplateInb.png' : """
The same as PlateInb.png but with a different colour gradient for each subplate within the main plate.
""",
'GUSbase_comet.jpg' : """
Plot to assess the validity of the binomial model for read count data generated using high-throughput sequencing technology.
SNPs tracking an intermediate slope (observed) may be due to non-diploid genome. A downward diagonal streak (expected) is due to an internal cutoff and is associated with the presence of high depth SNPs
"""
}
#print "DEBUG : calling get_cohorts"
if options["cohort_folder" ] is not None:
cohorts = [ options["cohort_folder" ] ]
out_stream_filename = options["output_filename"]
out_manifest_filename = "%s.manifest"%options["output_filename"]
else:
cohorts = get_cohorts(options)
for cohort in cohorts:
if options["cohort_folder" ] is None:
out_stream_filename = os.path.join(BASEDIR, options["run_folder"], "html", cohort, options["output_filename"])
out_manifest_filename = os.path.join(BASEDIR, options["run_folder"], "html", cohort, "%s.manifest"%options["output_filename"])
print cohort, out_stream_filename, out_manifest_filename
with open(out_stream_filename,"w") as out_stream:
with open(out_manifest_filename,"w") as out_manifest:
print >> out_manifest, options["output_filename"]
print >> out_manifest, "%s.manifest"%options["output_filename"]
if options["title"] is None:
print >> out_stream, header1%{"title" : cohort}
else:
print >> out_stream, header1%options
for (file_group, file_type) in file_group_iter:
# output overall header for file group
print >> out_stream, "<h2> %s </h2>\n"%file_group
# output cohort column headings
print >> out_stream, "<table width=90%% align=center>\n", \
"<tr>\n", \
"<td> <h4> Name </h4> </td>\n",\
"\n".join(["<td><h4> %s </h4></td>"%cohort for cohort in [cohort]]), \
"\n</tr>\n"
for file_name in file_iters[file_group]:
print >> out_stream , "<tr><td>%s</td>\n"%file_name
if options["run_folder"] is not None:
file_path = os.path.join(BASEDIR, options["run_folder"], cohort, file_name)
else:
file_path = os.path.join(cohort, file_name)
if file_type == "image":
image_relpath=file_name
print >> out_manifest, image_relpath
if os.path.isfile(file_path):
title = image_relpath
if os.path.basename(file_name) in narratives:
title = "\"%s (%s)\""%(image_relpath, narratives[os.path.basename(file_name)].replace('\n',''))
print >> out_stream, "<td> <img src=%s title=%s height=300 width=300/> </td>\n"%(image_relpath, title)
else:
print >> out_stream, "<td> unavailable </td>\n"
elif file_type == "link":
link_relpath=file_name
print >> out_manifest, link_relpath
if os.path.isfile(file_path):
print >> out_stream, "<td width=300> <a href=%s target=%s> %s </a></td>\n"%(link_relpath, file_name, link_relpath)
else:
print >> out_stream, "<td width=300> unavailable </td>\n"
print >> out_stream , "</tr>\n"
print >> out_stream, "</table>\n"
print >> out_stream, footer1
print stats
def get_options():
description = """
"""
long_description = """
examples :
# for a standard hiseq run
./make_clientcohort_pages.py -r 181112_D00390_0415_BCD04NANXX -o report.html
key file summary looks like :
run run_number lane samplename species file_name
140624_D00390_0044_BH9PEBADXX 0044 1 SQ0001 Deer 140624_D00390_0044_BH9PEBADXX.gbs/SQ0001.processed_sample/uneak/kmer_analysis/kmer_zipfian_comparisons.jpg
140624_D00390_0044_BH9PEBADXX 0044 2 SQ0001 Deer 140624_D00390_0044_BH9PEBADXX.gbs/SQ0001.processed_sample/uneak/kmer_analysis/kmer_zipfian_comparisons.jpg
140904_D00390_0209_BC4U6YACXX 0209 1 SQ0008 Deer 140904_D00390_0209_BC4U6YACXX.gbs/SQ0008.processed_sample/uneak/kmer_analysis/kmer_zipfian_comparisons.jpg
140904_D00390_0209_BC4U6YACXX 0209
# for some custom runs
python ../gbs_prism/make_clientcohort_pages.py -t "Unfiltered run" -o /dataset/gseq_processing/itmp/Gqueries/uneak_kgd/gbsathon_adapter/PstI-MspI/unfiltered.html /dataset/gseq_processing/itmp/Gqueries/uneak_kgd/gbsathon_adapter/PstI-MspI
python ../gbs_prism/make_clientcohort_pages.py -U filtered_hapMap -K filtered_KGD -t "Filtered run" -o /dataset/gseq_processing/itmp/Gqueries/uneak_kgd/gbsathon_adapter/PstI-MspI/filtered.html /dataset/gseq_processing/itmp/Gqueries/uneak_kgd/gbsathon_adapter/PstI-MspI
python ../gbs_prism/make_clientcohort_pages.py -U discarded_hapMap -K discarded_KGD -t "Filtered run - using discarded tags" -o /dataset/gseq_processing/itmp/Gqueries/uneak_kgd/gbsathon_adapter/PstI-MspI/discarded.html /dataset/gseq_processing/itmp/Gqueries/uneak_kgd/gbsathon_adapter/PstI-MspI
"""
parser = argparse.ArgumentParser(description=description, epilog=long_description, formatter_class = argparse.RawDescriptionHelpFormatter)
parser.add_argument('cohort_folder', nargs='?', type=str, default=None)
parser.add_argument('-r', '--run_folder' , dest='run_folder', default = None, type=str, help="run name")
parser.add_argument('-t', '--title' , dest='title', default = None, type=str, help="title")
parser.add_argument('-H', '--image_height' , dest='image_height', default=300, type=int, help="image height")
parser.add_argument('-W', '--image_width' , dest='image_width', default=300, type=int, help="image width")
parser.add_argument('-o', '--output_filename' , dest='output_filename', default="peacock.html", type=str, help="name of output file")
parser.add_argument('-U', '--hapmap_subfolder_name' , dest='hapmap_subfolder_name', default="hapMap", type=str, help="name of hapmap subfolder")
parser.add_argument('-K', '--kgd_subfolder_name' , dest='kgd_subfolder_name', default="KGD", type=str, help="name of KGD sub-folder name")
parser.add_argument('-b', '--basedir' , dest='basedir', default="/dataset/2023_illumina_sequencing_c/scratch/postprocessing/gbs", type=str, help="base dir of original output")
args = vars(parser.parse_args())
if args["run_folder"] is None and args["cohort_folder"] is None:
raise Exception("must specify either run folder or a cohort folder")
return args
def main():
options = get_options()
print options
generate_run_plot(options)
if __name__ == "__main__":
main()