-
Notifications
You must be signed in to change notification settings - Fork 0
/
vcfcombine.py
executable file
·179 lines (166 loc) · 6.55 KB
/
vcfcombine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python2
import sys
import argparse
import re
import os
import sys
import numpy
if sys.version_info[0] < 3:
from StringIO import StringIO
else:
from io import StringIO
import pandas as pd
###############################################
def findUnion(vcf1, vcf2, combine):
outlines = [line.rstrip() for line in vcf1 if line.startswith('##')]
samples1 = [s for s in vcf1 if '#CHROM' in s][0].split('\t')
samples1[-1] = samples1[-1].rstrip()
header = samples1[:]
samples2 = [s for s in vcf2 if '#CHROM' in s][0].split('\t')
samples2[-1] = samples2[-1].rstrip()
new_samples2 = []
old_samples2 = {}
for s in samples2[9:]:
if combine:
if s not in samples1:
header.append(s)
new_samples2.append(samples2.index(s))
else:
index1 = samples1.index(s)
index2 = samples2.index(s)
old_samples2[index2] = index1
else:
new_samples2.append(samples2.index(s))
if s in samples1[9:]:
s = "%s.X" % s
header.append(s)
outlines.append('\t'.join(header))
for line1 in vcf1:
if not line1.startswith('#'):
# Cycle lines of first VCF
line1 = line1.split('\t')
line1[-1] = line1[-1].rstrip()
match = '\t'.join(line1[0:2]) + '\t'
# If matching position in second VCF try to reconcile
line2 = [l.rstrip() for l in vcf2 if l.startswith(match)]
if len(line2) > 0:
line2 = line2[0].split('\t')
line2[-1] = line2[-1].rstrip()
# Reconcile differences in alt alleles
if line1[4] != line2[4]:
alt1 = line1[4].split(',')
alt2 = line2[4].split(',')
alt_all = alt1[:]
alt2_dict = {0: 0}
for i, a in enumerate(alt2):
if a not in alt1:
alt_all.append(a)
alt2_dict[i + 1] = alt_all.index(a) + 1
outline = line1[:4]
outline.append(','.join(alt_all))
outline = outline + line1[5:]
for s in new_samples2:
samp_data = line2[s].split(':')
genotype = re.split("[/|]", samp_data[0])
for i, haplotype in enumerate(genotype):
genotype[i] = str(alt2_dict[int(haplotype)])
genotype.sort()
genotype = '/'.join(genotype)
samp_data = [genotype] + samp_data[1:]
outline.append(':'.join(samp_data))
outlines.append('\t'.join(outline))
else:
outline = line1
for s in new_samples2:
outline.append(line2[s])
outlines.append('\t'.join(outline))
else:
outline = line1
field_info_len = len(line1[8].split(':')) - 1
new_sampleinfo = ':'.join(['.'] * field_info_len)
new_sampleinfo = "0/0:" + new_sampleinfo
for s in new_samples2:
outline.append(new_sampleinfo)
outlines.append('\t'.join(outline))
count = 0
for line1 in vcf2:
if not line1.startswith('#'):
line1 = line1.split('\t')
line1[-1] = line1[-1].rstrip()
match = '\t'.join(line1[0:2])
line2 = [l.rstrip() for l in vcf1 if l.startswith(match)]
if len(line2) == 0:
count = count + 1
outline = line1[:9]
field_info_len = len(line1[8].split(':')) - 1
new_sampleinfo = ':'.join(['.'] * field_info_len)
new_sampleinfo = "0/0:" + new_sampleinfo
for s in samples1[9:]:
outline.append(new_sampleinfo)
for s in new_samples2:
outline.append(line1[s])
for s in old_samples2.keys():
outline[old_samples2[s]] = line1[s]
outlines.append('\t'.join(outline))
return outlines
def sortVcf(vcf):
outlines = [line.rstrip() for line in vcf if line.startswith('#')]
contig_lines = [line.rstrip() for line in vcf if line.startswith('##contig=<ID=')]
if len(contig_lines) == 0:
print "Contig order not provided in header. Returning unsorted VCF."
return vcf
else:
contigs = []
for line in contig_lines:
contigs.append(re.split('[=,]', line)[2])
for contig in contigs:
match = contig + '\t'
contigmatches = [line.rstrip() for line in vcf if line.startswith(match)]
if len(contigmatches) > 0:
contigdata = StringIO('\n'.join(contigmatches))
pddata = pd.read_table(contigdata, header=None)
pddata.sort_values(by=1)
for line in pddata.values:
linelist = [str(i) for i in line.tolist()]
outlines.append('\t'.join(linelist))
return outlines
def main():
parser = argparse.ArgumentParser(description='Returns a VCF of all unique variant in all input VCFs.')
parser.add_argument('vcfs', metavar='vcf', type=str, nargs='+',
help='VCF file')
parser.add_argument('-o', '--output_file',
help='Output VCF file. File will be overwritten if it already exists.')
parser.add_argument('-c', '--combine',
help='Combine samples with same name into one column.', action="store_true")
args = parser.parse_args()
try:
args.output_file
if os.path.exists(args.output_file):
os.remove(args.output_file)
try:
out = open(args.output_file, "w+")
except:
print "Could not open file %s" % args.output_file
sys.exit(1)
except:
pass
try:
vcf1 = open(args.vcfs[0], "r").readlines()
except:
print "Could not open vcf %s" % args.vcfs[0]
sys.exit(1)
del args.vcfs[0]
while len(args.vcfs) >= 1:
try:
vcf2 = open(args.vcfs[0], "r").readlines()
except:
print "Could not open vcf %s" % args.vcfs[0]
sys.exit(1)
del args.vcfs[0]
vcf1 = findUnion(vcf1, vcf2, args.combine)
vcf1 = sortVcf(vcf1)
try:
out.write('\n'.join(vcf1))
except:
print '\n'.join(vcf1)
main()