-
Notifications
You must be signed in to change notification settings - Fork 1
/
autorescale.py
161 lines (148 loc) · 6.25 KB
/
autorescale.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import argparse
import random
# Function for parsing cmap files
def parse_cmap(cmapf, keep_length=False):
cmaps = {}
# contigCovs = {}
with open(cmapf) as infile:
for line in infile:
if line.startswith("#h"):
head = line.rstrip().rsplit()[1:]
elif not line.startswith("#"):
fields = line.rstrip().rsplit()
fD = dict(zip(head, fields))
if fD["CMapId"] not in cmaps:
cmaps[fD["CMapId"]] = {}
# contigCovs[fD["CMapId"]] = {}
# this is not a good way to parse label channel means color channel
if fD["LabelChannel"] == "1":
cmaps[fD["CMapId"]][int(fD["SiteID"])] = float(fD["Position"])
# contigCovs[fD["CMapId"]][int(fD["SiteID"])] = float(fD["Coverage"])
elif fD["LabelChannel"] == "0" and keep_length:
cmaps[fD["CMapId"]][int(fD["SiteID"])] = float(fD["Position"])
return cmaps
#function for parsing bnx files
def parse_bnx(bnxF, keep_length=False):
moleculeD = {}
with open(bnxF) as infile:
for line in infile:
if not line.startswith('#'):
fields = line.rstrip().rsplit("\t")
if line.startswith('0'):
currKey = fields[1]
elif line.startswith('1'):
if keep_length:
moleculeD[currKey] = [float(x) for x in fields[1:]]
else:
moleculeD[currKey] = [float(x) for x in fields[1:-1]]
return moleculeD
# size of downsample for detecting ratio
sampleSize = 250
parser = argparse.ArgumentParser()
# Query Dir
parser.add_argument("-q", "--query", help="query directory", required=True)
# Ref Dir
parser.add_argument("-r", "--ref", help="reference directory", required=True)
# Path to folder of FaNDOM
parser.add_argument("-f", "--fandom", help="FaNDOM directory", required=False)
# Output Dir
parser.add_argument("-o", "--output", help="Output directory", required=True)
# Number of thread
parser.add_argument("-t", "--thread", help="Number of Thread", required=True)
# Sample size
parser.add_argument("-s", "--samplesize", help="Sample size for detecting rescale factor", required=False)
args = parser.parse_args()
fandom_dir_path = ''
if args.fandom is not None :
fandom_dir_path = args.fandom
if fandom_dir_path[-1]!='/':
fandom_dir_path = fandom_dir_path + '/'
if str(args.samplesize)!="None":
sampleSize= int(args.samplesize)
#Only parse cmap or bnx files
if args.query.endswith('.bnx'):
query = parse_bnx(args.query,True)
elif args.query.endswith('.cmap'):
query = parse_cmap(args.query,True)
else:
print('Query file is not in cmap or bnx format')
quit()
print('Finish Parsing Query')
# if args.fandom.endswith('/'):
# fandom = args.fandom
# else:
# fandom = args.fandom+'/'
# print(fandom)
if not args.ref.endswith('.cmap'):
print('Reference should be in cmap format')
quit()
if args.output.endswith('/'):
output = args.output[:-1]
else:
output = args.output
#Down sample molecules
selected = [list(query.keys())[i] for i in random.sample(range(0, len(query)), sampleSize)]
#contail scaled moleculs
scaledSample = {}
#contain result and their score
result = {}
for i in range(-4, 21):
#write new resclaed sample on file
with open(output+'_a.cmap','w') as f :
for k in selected:
scaledSample[k] = query[k].copy()
if args.query.endswith('.bnx'):
scaledSample[k] = {i+1:j for i,j in enumerate(scaledSample[k])}
for index in scaledSample[k].keys():
scaledSample[k][index] =float(scaledSample[k][index] * (100-i))/100
length = scaledSample[k][list(scaledSample[k])[-1]]
number_label = index
for index in scaledSample[k]:
if index ==number_label:
f.write(k + '\t' + str(length) + '\t' + str(number_label) + '\t' + str(index) + '\t0\t' + str(
scaledSample[k][index]) + '\t0.0\t1.0\t1.0\t17.0000\t0.0000\n')
else:
f.write(k + '\t'+str(length)+'\t'+str(number_label)+'\t'+str(index)+'\t1\t'+str(scaledSample[k][index])+'\t0.0\t1.0\t1.0\t17.0000\t0.0000\n')
#Run Fandom on new resclaed molecules
os.system(fandom_dir_path+'./FaNDOM'+' -t='+args.thread+' -r='+args.ref+' -q='+output+'_a.cmap -sname='+output+'_test '+ '-outfmt=xmap -no_partial')
sum_score = 0
#Sum scores of molecule
with open(output+'_test.xmap','r') as f:
for line in f:
if not line.startswith('#'):
line = line.split('\t')
sum_score += float(line[8])
result[i] = sum_score
print(result)
os.system('rm '+output+'_a.cmap')
max_index = int(max(result,key=result.get))
print("############################################################")
print('Max score with rescale factor: ', max_index)
print("############################################################")
#Rescla all molecules with detected rescale factor
if args.query.endswith('.bnx'):
with open(output+'_rescaled.bnx','w') as f:
with open(args.query,'r') as g:
f.write('#Rescaled Factor= '+str(max_index)+'\n')
for line in g:
if line.startswith('0\t'):
line = line.strip().split('\t')
line[2] = str((float(line[2]) * (100-max_index))/100)
line = '\t'.join(line) + '\n'
elif line.startswith('1\t'):
line = line.strip().split('\t')
for i in range(1,len(line)):
line[i] = str((float(line[i]) * (100-max_index))/100)
line = '\t'.join(line) + '\n'
f.write(line)
if args.query.endswith('.cmap'):
with open(output + '_rescaled.cmap', 'w') as f:
with open(args.query, 'r') as g:
for line in g:
if not line.startswith('#'):
line = line.strip().split('\t')
line[1] = str((float(line[1]) * (100-max_index))/100)
line[5] = str((float(line[5]) * (100 - max_index)) / 100)
line = '\t'.join(line) + '\n'
f.write(line)