-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_binrepclass_domexpinfo.py
95 lines (68 loc) · 1.9 KB
/
add_binrepclass_domexpinfo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
"""
Author: Catarina Loureiro
A script to add rep bin classification from gtdbtk to vepe table
"""
import os
import argparse
def get_cmds():
    """
    Parse the command line options

    Three options are declared, each required and each taking one value
    shown as <file> in the usage text:
    -c / -cla: gtdbtk classification, stored as .cla
    -t / -tsv: bgc info tsv, stored as .tsv
    -o / -out: updated info tsv, stored as .out

    returns: argparse.Namespace with attributes cla, tsv and out
    """
    # (short flag, long flag, namespace attribute, help text)
    option_table = (
        ('-c', '-cla', 'cla', 'gtdbtk classification'),
        ('-t', '-tsv', 'tsv', 'bgc info tsv'),
        ('-o', '-out', 'out', 'updated info tsv'),
    )
    cli = argparse.ArgumentParser(description='')
    for short_flag, long_flag, target, help_text in option_table:
        cli.add_argument(short_flag, long_flag, dest=target, help=help_text,
                         required=True, metavar='<file>')
    return cli.parse_args()
def build_binrepclass_dict(gtdbtk_file):
    """
    extract binrep:class info

    gtdbtk_file: str, filepath of a tab-separated table whose first column
        is the bin name and whose second column is a ';'-separated lineage
    gtdbtk_dict: dict{str:str}, binrep:class, where binrep is the bin name
        with '.fa' appended and class is lineage fields 2, 6 and 7 joined
        with ';'

    Lines starting with 'user' are treated as the header and skipped.
    Blank lines, rows without a lineage column and rows whose lineage has
    fewer than seven fields are skipped as well, so those bins are simply
    absent from the returned dict instead of aborting the run.
    """
    gtdbtk_dict = {}
    # 'with' closes the file even if a row raises while being parsed
    with open(gtdbtk_file, 'r') as fileobj:
        for line in fileobj:
            line = line.strip()
            if not line or line.startswith('user'):
                continue
            elms = line.split('\t')
            if len(elms) < 2:
                continue
            class_elms = elms[1].split(';')
            # NOTE(review): for a GTDB d;p;c;o;f;g;s lineage, indices 1, 5
            # and 6 would be phylum, genus and species -- confirm. Shorter
            # lineages cannot supply all three fields.
            if len(class_elms) < 7:
                continue
            # '.fa' is appended to the bin name; presumably so the key
            # matches the bin file names used in the info tsv (the original
            # author flagged this with "check")
            binrep = elms[0] + '.fa'
            gtdbtk_dict[binrep] = ';'.join(
                [class_elms[1], class_elms[5], class_elms[6]])
    return gtdbtk_dict
def write_tsv(in_tsv, out_tsv, gtdbtk_dict):
    """
    update tsv with binrep info

    in_tsv, out_tsv: str, filepath
    gtdbtk_dict: dict{str:str}, binrep:class

    Every line of in_tsv is copied to out_tsv with one extra tab-separated
    column appended:
    - lines starting with '#' (header) get the column name 'BinRepClass'
    - data rows get gtdbtk_dict[binrep], where binrep is the 6th column
    - data rows get '-' when binrep is '-', when binrep is not a key of
      gtdbtk_dict, or when the row has fewer than six columns
    Blank lines are written back as empty lines.

    returns: None
    """
    # 'with' closes both files even if an error is raised part-way through
    with open(in_tsv, 'r') as inobj, open(out_tsv, 'w') as outobj:
        for line in inobj:
            # Remove only the line terminator. A full strip() would also
            # remove leading/trailing tabs, i.e. empty first/last columns,
            # which shifts the binrep column and misaligns the new one.
            line = line.rstrip('\r\n')
            if not line.strip():
                outobj.write('\n')
                continue
            if line.startswith('#'):
                outobj.write('{}\tBinRepClass\n'.format(line))
                continue
            elms = line.split('\t')
            # Rows too short to hold a binrep column are treated like
            # rows without a bin
            binrep = elms[5].strip() if len(elms) > 5 else '-'
            if binrep == '-':
                bin_class = '-'
            else:
                # only a missing key falls back to '-'; other errors
                # are no longer silently swallowed
                bin_class = gtdbtk_dict.get(binrep, '-')
            outobj.write('{}\t{}\n'.format(line, bin_class))
    return None
if __name__ == '__main__':
    # Script entry point: parse the options, build the binrep:class lookup
    # from the classification file, then write the extended tsv.
    args = get_cmds()
    write_tsv(args.tsv, args.out, build_binrepclass_dict(args.cla))