-
Notifications
You must be signed in to change notification settings - Fork 1
/
merge_filtered_hapmap.py
executable file
·210 lines (159 loc) · 8.53 KB
/
merge_filtered_hapmap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/bin/env pypy
from __future__ import print_function
import itertools
import sys
import argparse
import re
import os
# simple fasta iter
def fasta_iter(filestream):
    """Iterate over fasta records in an open text stream.

    Yields (name, lines) tuples where name is the first whitespace-delimited
    token of the header line with the leading ">" stripped, and lines is
    [name, seq_line_1, seq_line_2, ...] - i.e. the record name followed by the
    raw (stripped) sequence lines.  Blank lines are ignored.  A trailing
    header with no sequence lines yields nothing.
    """
    record_iter = (record.strip() for record in filestream if len(record.strip()) > 0)
    # group consecutive lines into alternating "name" (header) and "seq" runs
    seq_group_iter = itertools.groupby(record_iter, lambda record: {True: "name", False: "seq"}[record[0] == ">"])
    name = None
    for (group, records) in seq_group_iter:
        if group == "name":
            # fix: the original used records.next(), which is Python-2-only;
            # the next() builtin works on both Python 2 and 3
            name = next(records).split()[0][1:]
        else:
            yield (name, list(itertools.chain((name,), records)))
def apply_discards(filename, options):
    """Copy *filename* into options["outdir"], dropping discarded tags.

    options["discarded_fasta"] is a fasta file of discarded sequences (e.g.
    trimmed by cutadapt) with names like TAG_suffix.  The tag is the part of
    the name before the first underscore, so both tag-pair siblings (e.g.
    TP1_hit_64 / TP1_query_64) of a discarded tag are removed.

    *filename* may be tab-delimited (first column = tag, first line = heading,
    which is always copied) or fasta (sniffed by a leading ">").
    """
    with open(options["discarded_fasta"], "r") as fastastream:
        # names of the discarded sequences, e.g. "TP14277_hit_64"
        filtered_seqs_list = [item[0] for item in fasta_iter(fastastream)]
    # tag part before the first underscore; a set gives O(1) membership tests
    # (the original list made the filter loop quadratic)
    tagnames = set(re.match(r"^(\S+?)_", item).groups()[0] for item in filtered_seqs_list)
    # sniff the file to be merged: fasta or tab-delimited ?
    filetype = "tab"
    with open(filename, "r") as mergefile:
        for record in mergefile:
            if re.match(r"^>", record) is not None:
                filetype = "fasta"
                break
    with open(filename, "r") as mergestream:
        if filetype == "tab":
            mergefile = mergestream
        else:
            mergefile = fasta_iter(mergestream)
        mergename = os.path.join(options["outdir"], os.path.basename(filename))
        with open(mergename, "w") as mergeout:
            record_count = 0
            for record in mergefile:  # tab: a string ; fasta: (name, seq_lines)
                # the first record of a tab file is the heading - always copy it
                if record_count == 0 and filetype == "tab":
                    print(record, end="", file=mergeout)
                    record_count += 1
                    continue
                if filetype == "tab":
                    tag = re.split("\t", record)[0]
                else:
                    tag = re.match(r"^(\S+?)_", record[0]).groups()[0]
                # bug fix: the original condition was
                # "tuples[0] in tagnames and record_count > 0", which never
                # filtered the FIRST record of a fasta file (record_count is
                # still 0 there, since fasta files have no heading line)
                if tag in tagnames:
                    continue
                if filetype == "tab":
                    print(record, end="", file=mergeout)
                else:
                    print(">%s\n%s" % (record[0], "\n".join(record[1][1:])), file=mergeout)
                record_count += 1
def apply_includes(filename, options):
    """Copy *filename* into options["outdir"], keeping only listed tags.

    options["included_names"] is a text file of sequence names like
    TAG_suffix, one per line.  The tag is the part of the name before the
    first underscore; only records whose tag appears in that file are kept.

    *filename* may be tab-delimited (first column = tag, first line = heading,
    which is always copied) or fasta (sniffed by a leading ">").
    """
    with open(options["included_names"], "r") as namestream:
        # tag part before the first underscore; a set gives O(1) membership
        # tests (the original list made the filter loop quadratic)
        tagnames = set(re.match(r"^(\S+?)_", record).groups()[0] for record in namestream)
    # sniff the file to be merged: fasta or tab-delimited ?
    filetype = "tab"
    with open(filename, "r") as mergefile:
        for record in mergefile:
            if re.match(r"^>", record) is not None:
                filetype = "fasta"
                break
    with open(filename, "r") as mergestream:
        if filetype == "tab":
            mergefile = mergestream
        else:
            mergefile = fasta_iter(mergestream)
        mergename = os.path.join(options["outdir"], os.path.basename(filename))
        with open(mergename, "w") as mergeout:
            record_count = 0
            for record in mergefile:  # tab: a string ; fasta: (name, seq_lines)
                # the first record of a tab file is the heading - always copy it
                if record_count == 0 and filetype == "tab":
                    print(record, end="", file=mergeout)
                    record_count += 1
                    continue
                if filetype == "tab":
                    tag = re.split("\t", record)[0]
                else:
                    tag = re.match(r"^(\S+?)_", record[0]).groups()[0]
                if tag not in tagnames:
                    continue
                if filetype == "tab":
                    print(record, end="", file=mergeout)
                else:
                    print(">%s\n%s" % (record[0], "\n".join(record[1][1:])), file=mergeout)
                record_count += 1
def get_options():
    """Parse the command line and return the arguments as a plain dict."""
    description = """
Use a fasta file of discards ( for example, tags with adapter have been removed e.g. by cutadapt) to filter the
tabular and fasta hapmap files , by removing the discards together with any tag-pair siblings of the discards.
Example counts :
iramohio-01$ wc in.txt out.txt
15498 15498 258234 in.txt ( = all tags in )
15463 15463 257658 out.txt ( = tags out together with tags discarded by cutadapt - the difference (35) is untrimmed sibling of trimmed partner)
reconcile :
tags out :
iramohio-01$ grep ">" /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap_filtered/HapMap.fas.txt | wc
11998 11998 199880
tags in :
iramohio-01$ grep ">" /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap/HapMap.fas.txt | wc
15498 15498 258234
tags discarded :
iramohio-01$ grep ">" /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap_filtered/HapMap.fas.discarded.txt | wc
3465 3465 57778
15498 tags less 3465 discarded (cutadapt trim) less 35 (untrimmed sibling of trimmed partner) = 11,998
Example :
iramohio-01$ diff in.txt out.txt | head
117d116
< >TP10212_hit_64
1279d1277
< >TP1314_hit_64
1794d1791
< >TP14277_query_64 <---------
1806d1802
< >TP14304_query_64
2007d2002
< >TP14868_hit_64
iramohio-01$ grep TP14277 /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap_filtered/HapMap.fas.discarded.txt
>TP14277_hit_64
iramohio-01$
i.e. TP14277_hit_64 was discarded by cutadapt (but not its sibling), but we discard both
"""
    long_description = """
Example :
merge_filtered_hapmap.py -D /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap_filtered/HapMap.fas.discarded.txt -O /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap_filtered /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap/HapMap.hmc.txt /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap/HapMap.hmp.txt /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap/HapMap.fas.txt /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap/HapMap.hmp.txt.blinded /dataset/gseq_processing/scratch/gbs/200730_D00390_0568_BCECP9ANXX/SQ1326.all.PstI.PstI/hapMap/HapMap.hmc.txt.blinded
"""
    # RawDescriptionHelpFormatter preserves the hand-laid-out examples above
    arg_parser = argparse.ArgumentParser(
        description=description,
        epilog=long_description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    arg_parser.add_argument(
        "files", type=str, nargs="*",
        help="space-seperated list of files to process",
    )
    arg_parser.add_argument(
        "-D", "--discarded_fasta", dest="discarded_fasta", type=str, default=None,
        help="fasta file of discarded tags (e.g. from cutadapt)",
    )
    arg_parser.add_argument(
        "-I", "--included_names", dest="included_names", type=str, default=None,
        help="file containing tag names to be included",
    )
    arg_parser.add_argument(
        "-O", "--outdir", dest="outdir", type=str, default=None, required=True,
        help="out dir",
    )
    return vars(arg_parser.parse_args())
def main():
    """Filter each input file by discards (-D) or by an include list (-I).

    Exactly one of the two modes must be selected; each surviving file is
    written under the -O output directory by the chosen filter function.
    """
    args = get_options()
    # Validate the mode once, up front.  The original raised inside the loop,
    # so an invalid option combination was only reported after the first
    # existing file was reached (and not at all with an empty file list).
    if args["discarded_fasta"] is not None and args["included_names"] is None:
        process = apply_discards
    elif args["discarded_fasta"] is None and args["included_names"] is not None:
        process = apply_includes
    else:
        raise Exception("must supply one of discards or included")
    for filetodo in args["files"]:
        if not os.path.exists(filetodo):
            # NOTE(review): message says "merge_hapMap_filtered.py" although this
            # script is merge_filtered_hapmap.py - kept verbatim for log compatibility
            print("(merge_hapMap_filtered.py: %s does not exist so ignoring) " % filetodo)
            continue
        process(filetodo, args)


if __name__ == "__main__":
    main()