-
Notifications
You must be signed in to change notification settings - Fork 0
/
wbbc.py
283 lines (252 loc) · 11.1 KB
/
wbbc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# -*- coding: utf-8 -*-
import os
import statistics
from concurrent.futures import ThreadPoolExecutor, as_completed
VCF_FILENAME = "wbbc_vcf/WBBC.chr{}.GRCh37_PhaseI.vcf"
# https://wbbc.westlake.edu.cn/
# 根据西湖中国样本库,生成祖源模型文件
# The VCF is annotated with rsIDs from dbSNP151, and the following INFO fields:
# AC:Allele count in called genotypes in WBBC
# AF:Allele frequency in called genotypes in WBBC
# AN:Total number of alleles in called genotypes in WBBC
# NS:Total number of samples in called genotypes in WBBC
# North_AF:Allele frequency in North Han Chinese
# North_AN:Total number of alleles in North Han Chinese
# Central_AF:Allele frequency in Central Han Chinese
# Central_AN:Total number of alleles in Central Han Chinese
# South_AF:Allele frequency in South Han Chinese
# South_AN:Total number of alleles in South Han Chinese
# Lingnan_AF:Allele frequency in Lingnan Han Chinese
# Lingnan_AN:Total number of alleles in Lingnan Han Chinese
# DP:Raw read depth
# VQSLOD:Variant Recalibration Score from GATK
def make_allele_frq(
tsvFiles,
highLDFileName="",
modelPath=".",
alleleFrqFile="wbbc",
afDigits=6,
stdDevThreshold=0.03,
maxWorkers=4,
):
try:
# 从TSV文件获取RSID集合
rsid_set = get_rsid(tsvFiles, highLDFileName)
# SNP总计数
snp_total_count = 0
with open(
"{}/{}.alleles".format(modelPath, alleleFrqFile), "w", encoding="utf-8"
) as alleleFile:
with open(
"{}/{}.F".format(modelPath, alleleFrqFile), "w", encoding="utf-8"
) as frqFile:
# 多线程遍历所有VCF文件
with ThreadPoolExecutor(max_workers=maxWorkers) as t:
task_list = []
for chr in range(1, 23):
print("\tTask {} is launching...".format(chr))
task_list.append(
t.submit(
match_snp_from_vcf,
rsid_set,
VCF_FILENAME.format(chr),
afDigits,
stdDevThreshold,
)
)
# 异步等待线程,每个VCF文件处理结果
for task in as_completed(task_list):
allele_list, frq_list = task.result()
if len(allele_list) == len(frq_list):
snp_total_count += len(allele_list)
# 结果写入alleles文件
alleleFile.writelines(allele_list)
alleleFile.flush()
# 结果写入frequency文件
frqFile.writelines(frq_list)
frqFile.flush()
print(
"\tOne task success, {:,} SNPs have been saved in {}.alleles and {}.F respectively".format(
len(allele_list), alleleFrqFile, alleleFrqFile
)
)
else:
print(
"\tOne task failed and not saved to file. The rows of alleles is {}, while the rows of frequency is {}".format(
len(allele_list), len(frq_list)
)
)
print(
"All {} tasks finished. Totally {:,} SNPs have been saved in {}.alleles and {}.F respectively.".format(
len(task_list), snp_total_count, alleleFrqFile, alleleFrqFile
)
)
except FileNotFoundError:
print("文件不存在!")
except PermissionError:
print("无权限访问文件!")
except Exception as e:
print(f"发生未知错误:{e}")
# 在指定的VCF文件中筛选符合条件的SNP
def match_snp_from_vcf(rsid_set, vcfFileName, afDigits, stdDevThreshold):
# 当前VCF文件中筛选出的alleles和frequency列表
allele_list = []
frq_list = []
# 遍历指定的VCF文件
with open(
vcfFileName,
"r",
encoding="utf-8",
) as vcfFile:
print("Processing WBBC vcf file: " + vcfFileName)
# 构造vcf字典,以RSID作为键,REF,ALT和INFO作为键值,只需要SNP,不包括Indel,生成vcf字典集
vcfDict = {}
for vcfLine in vcfFile.readlines():
if len(vcfLine) > 0 and not vcfLine.startswith("#"):
vcfLineList = vcfLine.split("\t")
if (
len(vcfLineList) == 8
and vcfLineList[2] != "."
and len(vcfLineList[3]) == 1
and len(vcfLineList[4]) == 1
):
vcfDict[vcfLineList[2]] = {
"ref": vcfLineList[3],
"alt": vcfLineList[4],
"info": vcfLineList[7],
}
# 遍历rsid模板集合,在vcf字典集查询对应的rsid,如有则加入
for rsid in rsid_set:
if rsid in vcfDict:
infoList = vcfDict[rsid]["info"].split(";")
if len(infoList) == 15:
af = float(infoList[1].split("=")[1])
if af != 0 and af != 1:
# 解析每个人群的alt allele频率
north_af = float(infoList[4].split("=")[1])
central_af = float(infoList[6].split("=")[1])
south_af = float(infoList[8].split("=")[1])
lingnan_af = float(infoList[10].split("=")[1])
# 只统计人群allele频率标准差大于阈值的SNP
if (
statistics.pstdev(
[
north_af,
central_af,
south_af,
lingnan_af,
]
)
< stdDevThreshold
):
continue
# 构造allele数据行,较大的是major突变, 较小的是minor突变
major_allele = (
vcfDict[rsid]["ref"] if af < 0.5 else vcfDict[rsid]["alt"]
)
minor_allele = (
vcfDict[rsid]["ref"] if af >= 0.5 else vcfDict[rsid]["alt"]
)
alleleLine = "{} {} {}\n".format(
rsid,
minor_allele,
major_allele,
)
allele_list.append(alleleLine)
# print("\tCollecting allele data: " + alleleLine)
# 构造frequency数据行,获取每个祖源成分的major突变频率
frqLine = "{} {} {} {}\n".format(
round(
(north_af if af >= 0.5 else 1 - north_af),
afDigits,
),
round(
(central_af if af >= 0.5 else 1 - central_af),
afDigits,
),
round(
(south_af if af >= 0.5 else 1 - south_af),
afDigits,
),
round(
(lingnan_af if af >= 0.5 else 1 - lingnan_af),
afDigits,
),
)
frq_list.append(frqLine)
# print("\tCollecting frequency data: " + frqLine)
return allele_list, frq_list
# 从TSV文件获取RSID集合
def get_rsid(tsvFiles, highLDFileName):
# TSV数据源的RSID集合
rsid_set = set()
# 获取高连锁不平衡数据
hld_list = (
get_high_ld(highLDFileName)
if highLDFileName != "" and highLDFileName != None
else []
)
rsid_in_hld_count = 0
# 常染色体集合
chr_set = {str(chr) for chr in range(1, 23)}
if tsvFiles != None:
for tsvFileName in tsvFiles:
print("Collecting RSID in TSV file {0}...".format(tsvFileName))
with open(tsvFileName, "r", encoding="utf-8") as tsvFile:
for tsvLine in tsvFile.readlines():
if len(tsvLine) > 0 and not tsvLine.startswith(
("#", "\n", "\t", '"')
):
tsvLineList = tsvLine.split("\t")
if (
len(tsvLineList) >= 2
and tsvLineList[0] not in rsid_set
and tsvLineList[1] in chr_set
):
# 去除高连锁不平衡的SNP
inHLD = False
for i in range(len(hld_list)):
if (
tsvLineList[1] == hld_list[i]["chr"]
and int(tsvLineList[2]) >= hld_list[i]["start_pos"]
and int(tsvLineList[2]) <= hld_list[i]["end_pos"]
):
rsid_in_hld_count += 1
inHLD = True
break
if not inHLD:
rsid_set.add(tsvLineList[0])
if len(rsid_set) > 0:
print(
"Finish collecting {:,} RSID in {}\nApart from {:,} RSID in high LD region.".format(
len(rsid_set), " ".join(tsvFiles), rsid_in_hld_count
)
)
else:
raise Exception("There is no available SNP in TSV files.")
return rsid_set
# 获取连锁不平衡数据high linkage disequilibrium
# https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)
def get_high_ld(ldFileName):
if not os.access(ldFileName, os.F_OK):
raise Exception(
"High Linkage Disequilibrium file '{}' is not available.".format(ldFileName)
)
ld_list = []
with open(
ldFileName,
"r",
encoding="utf-8",
) as hdFile:
for hdLine in hdFile.readlines():
if len(hdLine) > 0 and not hdLine.startswith("#"):
hdLineList = hdLine.split("\t")
if len(hdLineList) == 3:
ld_list.append(
{
"chr": hdLineList[0],
"start_pos": int(hdLineList[1]),
"end_pos": int(hdLineList[2]),
}
)
return ld_list