-
Notifications
You must be signed in to change notification settings - Fork 0
/
vcfallelefreqs.py
executable file
·218 lines (199 loc) · 8.75 KB
/
vcfallelefreqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env python2
import argparse
import os
import sys
import numpy
def autocorr(x, t=1):
a = numpy.array([x[0:len(x)-t], x[t:len(x)]]).astype(numpy.float)
return numpy.corrcoef(a)[0][1]
def vcfSortBySample(vcf, samples):
print "vcfsortbysample"
outlines = [line.rstrip() for line in vcf if line.startswith('##')]
header = [s for s in vcf if '#CHROM' in s][0].split('\t')
header[-1] = header[-1].rstrip()
samples = samples.split(',')
sample_dict = {}
for i, sample in enumerate(samples):
sample_dict[i] = header.index(sample)
outheader = header[0:9] + samples
outlines.append('\t'.join(outheader))
for line in vcf:
if not line.startswith('#'):
line = line.split('\t')
line[-1] = line[-1].rstrip()
outline = line[:9]
for i in sample_dict.keys():
outline.append(line[sample_dict[i]])
outlines.append('\t'.join(outline))
print "vcfsortbysample finish"
return outlines
def alleleFreqs(vcf, annots, args):
outdict = {}
print "allelefreqs"
outdict['header'] = '\n'.join([line.rstrip() for line in vcf if line.startswith('#')])
for line in vcf:
if not line.startswith('#'):
# Break down VCF components into array
line = line.split('\t')
# Get the the variant position
outline_pos = line[0:2]
gt_info = line[8].split(':')
dp_index = gt_info.index('DP')
ad_index = gt_info.index('AD')
alleles = [line[3]] + line[4].split(',')
ref_allele = alleles[0]
annot_info = line[7].replace('=',';').split(';')
out_annots = []
for annot in annots:
if annot in annot_info:
annot_index = annot_info.index(annot) + 1
out_annots.append(annot_info[annot_index])
else:
out_annots.append("NA")
for i, allele in enumerate(alleles[1:]):
outline = outline_pos[:]
outline = outline + [ref_allele, allele] + out_annots
dps = []
for s in line[9:]:
s = s.split(':')
# Make sure that genotype fields match description
if len(s) < len(gt_info):
allele_freq = 'NA'
dps.append(0.0)
else:
dp = s[dp_index]
ad = s[ad_index].split(',')
while len(ad) < len(alleles):
ad.append('0')
if dp == '.' or int(dp) == 0:
allele_freq = str(0)
dp = 0
else:
if args.dp_out:
allele_freq = str(dp)
elif args.AF:
af_index = gt_info.index('AF')
af = s[af_index].split(',')
if i < len(af):
allele_freq = str(float(af[i]))
else:
allele_freq = 0
else:
allele_freq = str(float(ad[i + 1]) / float(dp))
dps.append(float(dp))
outline.append(allele_freq)
outline.append(str(sum(dps) / len(dps)))
if '\t'.join(line) not in outdict:
outdict['\t'.join(line)] = ['\t'.join(outline)]
else:
outdict['\t'.join(line)].append('\t'.join(outline))
print "Finished allele freqs"
return outdict
def filterVcfFreqs(vcfdict, vcf, args):
print "filtervcffreqs"
outdict = {'header': vcfdict['header']}
outvcf = []
# Q = 0
for vcfline in vcf:
# Q = Q + 1
if not vcfline.startswith('#'):
for f_line in vcfdict[vcfline]:
freqline = f_line.split('\t')
avg_dp = float(freqline.pop())
annot_extra = len(args.output_annotations.split(','))
freqs = [float(i) for i in freqline[4 + annot_extra:] if i != 'NA']
if (args.min_depth and avg_dp >= float(args.min_depth)) or not args.min_depth:
fdiff = abs(max(freqs) - min(freqs))
avg_freq = sum(freqs) / len(freqs)
if (args.min_freq and float(args.min_freq) <= float(avg_freq) <= float(args.max_freq)) or not args.min_freq:
if args.autocorrelation and len(freqs) > 1:
# print(Q)
a = autocorr(freqs)
else:
a = 0
if (args.autocorrelation and abs(float(a)) >= float(args.autocorrelation)) or not args.autocorrelation:
outfreq = freqline[:4 + annot_extra] + [str(avg_dp), str(fdiff), str(a)] + freqline[4 + annot_extra:]
if vcfline not in outdict:
outdict[vcfline] = ['\t'.join(outfreq)]
outvcf.append(vcfline)
else:
outdict[vcfline].append('\t'.join(outfreq))
else:
outvcf.append(vcfline)
print "filtervcffreqs finish"
return outdict, outvcf
def main():
parser = argparse.ArgumentParser(description='Returns CSV containing allele information and frequency')
parser.add_argument('vcf', metavar='vcf', type=str, nargs=1,
help='VCF file')
parser.add_argument('-o', '--output_base',
help='Output basename. Files will be overwritten if it already exists.')
parser.add_argument('-s', '--samples',
help='Quoted string of samples in the order in which to sort.')
parser.add_argument('-d', '--diff',
help='Minimum difference between max and min frequencies to pass filter.')
parser.add_argument('-a', '--autocorrelation',
help='Minimum autocorrelation with lag=1.')
parser.add_argument('-m', '--min_depth',
help='Minimum average depth for a position.')
parser.add_argument('-f', '--min_freq',
help='Minimum average alternate allele frequency for a position.')
parser.add_argument('-F', '--max_freq',
help='Maximum average alternate allele frequency for a position.', default = 1)
parser.add_argument('-A', '--output_annotations',
help='Annotations to output (separated by commas) e.g MQ,ReadPosRankSum).', default="MQ")
parser.add_argument('-D', '--dp_out',
help='Only output site read depth instead of freq.', action='store_true')
parser.add_argument('-l', '--AF',
help='Use allele frequency information provided by the caller', action='store_true')
args = parser.parse_args()
try:
args.output_base
outvcfpath = args.output_base + ".vcf"
outfreqpath = args.output_base + ".txt"
if os.path.exists(outvcfpath):
os.remove(outvcfpath)
if os.path.exists(outfreqpath):
os.remove(outfreqpath)
try:
outVCFpath = open(outvcfpath, "w+")
except:
print "Could not open file %s" % outvcfpath
sys.exit(1)
try:
outFREQpath = open(outfreqpath, "w+")
except:
print "Could not open file %s" % outfreqpath
sys.exit(1)
except:
pass
try:
vcf = open(args.vcf[0], "r").readlines()
except:
print "Could not open vcf %s" % args.vcfs[0]
sys.exit(1)
if args.samples:
vcf = vcfSortBySample(vcf, args.samples)
annots = args.output_annotations.split(',')
outdict = alleleFreqs(vcf, annots, args)
outdict, outvcf = filterVcfFreqs(outdict, vcf, args)
#args.diff, args.autocorrelation, args.min_depth, args.min_freq,
# args.max_freq)
samples = [s for s in outvcf if '#CHROM' in s][0].split('\t')[9:]
outfreqs = ["CHROM\tPOS\tREF\tALT\t" + '\t'.join(annots) + "\tDP\tDIFF\tRk\t" + '\t'.join(samples).rstrip()]
# print outfreqs[0]
for line in outvcf:
if not line.startswith('#'):
outfreqs = outfreqs + outdict[line]
try:
outFREQpath.write('\n'.join(outfreqs))
except:
print '\n'.join(outfreqs)
try:
outVCFpath.write(''.join(outvcf))
except:
print ''.join(outvcf)
main()
# Issues:
# Chromosomes need to be in correct order in header
# BK006938.2