forked from ChaissonLab/TT-Mars
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine.py
302 lines (254 loc) · 11 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import csv
import sys
import argparse
import fn
import ttmars
# parser = argparse.ArgumentParser()
# parser.add_argument("output_dir",
# help="output directory")
# parser.add_argument("no_X_chr",
# choices=[1, 2],
# help="male sample 1, female sample 2",
# type=int)
# parser.add_argument("-v",
# "--vcf_out",
# help="output results as vcf files, must be used together with -f/--vcf_file",
# action="store_true")
# parser.add_argument("-f",
# "--vcf_file",
# help="input vcf file using as template, must be used together with -v/--vcf_out or -n/--false_neg")
# parser.add_argument("-g",
# "--gt_vali",
# help="conduct genotype validation",
# action="store_true")
# parser.add_argument("-n",
# "--false_neg",
# help="output false negative, must be used together with -t/--truth_file and -f/--vcf_file",
# action="store_true")
# parser.add_argument("-t",
# "--truth_file",
# help="input truth vcf file, must be used together with -n/--false_neg")
# args = parser.parse_args()
# if bool(args.vcf_out) ^ bool(args.vcf_file):
# parser.error('-f/--vcf_file must be used with -v/--vcf_out')
# if bool(args.false_neg) ^ bool(args.truth_file):
# parser.error('-t/--truth_file must be used with -n/--false_neg')
# if (bool(args.false_neg)) and (not bool(args.vcf_file)):
# parser.error('-n/--false_neg must be used with -f/--vcf_file')
# output_dir = args.output_dir + "/"
# if int(args.no_X_chr) == 1:
# if_male = True
# else:
# if_male = False
# if args.vcf_out:
# if_vcf = True
# in_vcf_file = args.vcf_file
# else:
# if_vcf = False
# if args.false_neg:
# output_fn = True
# in_truth_file = args.truth_file
# else:
# output_fn = False
output_dir = ttmars.output_dir
if_male = ttmars.if_male
if_vcf = ttmars.if_vcf
in_vcf_file = ttmars.vcf_file
output_fn = ttmars.output_fn
if output_fn:
in_truth_file = ttmars.in_truth_file
if_gt = ttmars.if_gt
def combine_output():
other_sv_res_file = output_dir+"ttmars_res.txt"
regdup_res_file = output_dir+"ttmars_regdup_res.txt"
agg_res_file = output_dir+"ttmars_agg_res.txt"
with open(other_sv_res_file) as f:
reader = csv.reader(f, delimiter="\t")
other_sv_res = list(reader)
f.close()
if_dup = True
try:
with open(regdup_res_file) as f:
reader = csv.reader(f, delimiter="\t")
regdup_res = list(reader)
f.close()
except:
if_dup = False
if_agg = True
try:
with open(agg_res_file) as f:
reader = csv.reader(f, delimiter="\t")
agg_res = list(reader)
f.close()
except:
if_agg = False
#if output fn
if output_fn:
#input truth set
sv_list = fn.idx_sv(in_truth_file)
#input candidate set
cand_sv_list = fn.idx_sv(in_vcf_file)
sv_list_sorted = fn.sort_sv_list(sv_list)
cand_sv_list_sorted = fn.sort_sv_list(cand_sv_list)
tp_base_ctr = fn.count_tp_base_dist_only(sv_list_sorted, cand_sv_list_sorted)
recall = tp_base_ctr / len(sv_list_sorted)
print("Recall of candidate callset: " + str(recall))
sv_dict = {}
#results of SVs other than interspersed DUP
for rec in other_sv_res:
sv_idx = rec[0]
ref_name = rec[1]
sv_pos = int(rec[2])
sv_end = int(rec[3])
sv_type = rec[4]
if not if_gt:
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type]
else:
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
# if not if_gt:
# sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)] = [rec[4], rec[5], rec[6]]
# else:
# sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)] = [rec[4], rec[5], rec[6], rec[7]]
#interspersed DUP
if if_dup:
for rec in regdup_res:
sv_idx = rec[0]
ref_name = rec[1]
sv_pos = int(rec[2])
sv_end = int(rec[3])
sv_type = rec[4]
if sv_idx in sv_dict:
if not if_gt:
if rec[7] == 'True' and sv_dict[sv_idx][2] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type]
else:
if rec[7] == 'True' and sv_dict[sv_idx][2] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
elif rec[7] == 'True' and sv_dict[sv_idx][2] == 'True':
if rec[8] == 'True' and sv_dict[sv_idx][3] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
else:
if not if_gt:
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type]
else:
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
#agg sv
if if_agg:
for rec in agg_res:
sv_idx = rec[0]
ref_name = rec[1]
sv_pos = int(rec[2])
sv_end = int(rec[3])
sv_type = rec[4]
if sv_idx in sv_dict:
if not if_gt:
if rec[7] == 'True' and sv_dict[sv_idx][2] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type]
else:
if rec[7] == 'True' and sv_dict[sv_idx][2] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
elif rec[7] == 'True' and sv_dict[sv_idx][2] == 'True':
if rec[8] == 'True' and sv_dict[sv_idx][3] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
else:
if not if_gt:
if rec[7] == 'True':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type]
else:
if rec[7] == 'True':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
if if_male:
chrx_res_file = output_dir+"ttmars_chrx_res.txt"
with open(chrx_res_file) as f:
reader = csv.reader(f, delimiter="\t")
chrx_res = list(reader)
f.close()
for rec in chrx_res:
if len(rec) == 0:
continue
sv_idx = rec[0]
ref_name = rec[1]
sv_pos = int(rec[2])
sv_end = int(rec[3])
sv_type = rec[4]
if sv_idx in sv_dict:
if not if_gt:
if rec[7] == 'True' and sv_dict[sv_idx][2] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type]
else:
if rec[7] == 'True' and sv_dict[sv_idx][2] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
elif rec[7] == 'True' and sv_dict[sv_idx][2] == 'True':
if rec[8] == 'True' and sv_dict[sv_idx][3] == 'False':
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
else:
if not if_gt:
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type]
else:
sv_dict[sv_idx] = [rec[5], rec[6], rec[7], ref_name, int(sv_pos), int(sv_end), sv_type, rec[8]]
g = open(output_dir + "/ttmars_combined_res.txt", "w")
for key in sv_dict:
res = sv_dict[key]
g.write(str(key) + "\t")
g.write(str(res[0]) + "\t")
g.write(str(res[1]) + "\t")
g.write(str(res[2]) + "\t")
g.write(str(res[3]) + "\t")
g.write(str(res[4]) + "\t")
g.write(str(res[5]) + "\t")
g.write(str(res[6]))
if if_gt:
g.write("\t" + str(res[7]))
g.write("\n")
g.close()
#if output vcf
if if_vcf:
from pysam import VariantFile
vcf_in = VariantFile(in_vcf_file)
vcfh = vcf_in.header
#vcfh.add_meta('INFO', items=[('ID',"TTMars"), ('Number',1), ('Type','String'),('Description','TT-Mars NA12878 results: TP, FA, NA or .')])
vcfh.add_meta('INFO', items=[('ID',"GT_vali"), ('Number',1), ('Type','String'),('Description','TT-Mars GT validation (require flag -g): True, False or NA')])
vcf_out_tp = VariantFile(output_dir+"ttmars_tp.vcf", 'w', header=vcfh)
vcf_out_fp = VariantFile(output_dir+"ttmars_fp.vcf", 'w', header=vcfh)
vcf_out_na = VariantFile(output_dir+"ttmars_na.vcf", 'w', header=vcfh)
for count, rec in enumerate(vcf_in.fetch()):
key = str(count)
try:
validation_res = sv_dict[key][2]
if validation_res == 'True':
if if_gt:
rec.info['GT_vali'] = sv_dict[key][7]
vcf_out_tp.write(rec)
elif validation_res == 'False':
vcf_out_fp.write(rec)
except:
vcf_out_na.write(rec)
# for rec in vcf_in.fetch():
# ref_name = rec.chrom
# sv_type = rec.info['SVTYPE']
# sv_pos = rec.pos
# sv_end = rec.stop
# try:
# validation_res = sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)][2]
# if validation_res == 'True':
# if if_gt:
# rec.info['GT_vali'] = sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)][3]
# vcf_out_tp.write(rec)
# elif validation_res == 'False':
# vcf_out_fp.write(rec)
# except:
# vcf_out_na.write(rec)
vcf_out_tp.close()
vcf_out_fp.close()
vcf_out_na.close()
vcf_in.close()
#remove files
def remove_files():
import os
for name in ['assem1_non_cov_regions.bed', 'assem2_non_cov_regions.bed',
'exclude_assem1_non_cover.bed', 'exclude_assem2_non_cover.bed',
'SV_positions.bed', 'ttmars_chrx_res.txt', 'ttmars_regdup_res.txt',
'ttmars_res.txt', 'align_info_assem1_chrall.txt', 'align_info_assem2_chrall.txt',
'all_reg_dup.fasta', 'all_reg_dup.fasta.fai']:
if os.path.exists(output_dir + name):
os.remove(output_dir + name)