# cluster_score.py
# Forked from kiharalab/VESPER.
# calculate the normalized z-score for each of the top 10 models using single-linkage clustering
import argparse
import numpy as np
from collections import defaultdict, Counter
from scipy.cluster.hierarchy import linkage, fcluster
# Two-character prefixes of the lines that carry the scores of models #0..#9.
_MODEL_LINE_START = tuple('#' + str(i) for i in range(10))


def _parse_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: list of argument strings; None makes argparse read sys.argv[1:].

    Returns:
        The argparse namespace with ``input_file``, ``cutoff`` and ``out_name``.
    """
    parser = argparse.ArgumentParser(description='Calculate the normalized z-score for top 10 models from VESPER. Normalized z-scores for top 10 models are written into the output file.')
    parser.add_argument('-i', '--input', required=True, action='store', dest='input_file', help='Required. Name of input file.')
    parser.add_argument('-c', type=float, action='store', dest='cutoff', default=0.2, help='Optional. Clustering cutoff ranging from 0 to 1. Default = 0.2.')
    parser.add_argument('-o', '--output', action='store', dest='out_name', help='Optional. Name of output file. If not specified, the output file would be named as input filename followed by .normzscore.')
    return parser.parse_args(argv)


def _read_scores(path):
    """Read the score information from the input file.

    Lines starting with ``'R '`` contribute their last whitespace-separated
    token to the score list; lines starting with ``#0`` .. ``#9`` contribute
    their third-from-last token as the score of that model.

    Args:
        path: name of the input file.

    Returns:
        ``(score_list, model_score_info)`` where ``score_list`` is a list of
        floats and ``model_score_info`` is a length-10 array indexed by model
        number, or ``None`` when the file contains no ``'Score='`` text.
    """
    with open(path) as zscore_file:
        zscore_content = zscore_file.read()
    if 'Score=' not in zscore_content:
        return None

    score_list = []
    # Pre-fill with NaN (not np.empty) so that a model missing from the file
    # is caught by the NaN check instead of leaking uninitialised memory
    # into the result.
    model_score_info = np.full(10, np.nan)
    for line in zscore_content.split('\n'):
        prefix = line[0:2]
        if prefix == 'R ':
            score_list.append(float(line.split()[-1]))
        elif prefix in _MODEL_LINE_START:
            model_score_info[int(line[1])] = float(line.split()[-3])
    return score_list, model_score_info


def _largest_cluster_stats(score_list, cutoff):
    """Return (mean, std) of the largest single-linkage cluster of the scores.

    The distance threshold for cutting the tree is ``cutoff`` times the range
    (max - min) of the scores. ``score_list`` must hold at least two values.
    """
    scores = np.asarray(score_list, dtype=float)
    max_d = cutoff * (scores.max() - scores.min())
    # single linkage clustering on the 1-D scores (one observation per row)
    clusters = fcluster(linkage(scores.reshape(-1, 1), 'single'), max_d, criterion='distance')
    cluster_count = Counter(clusters)
    # On a tie, max() keeps the first label seen, exactly like the stable
    # descending sort it replaces.
    first_cluster_index = max(cluster_count, key=cluster_count.get)
    first_cluster = scores[clusters == first_cluster_index]
    return np.mean(first_cluster), np.std(first_cluster)


def main(argv=None):
    """Compute and write the normalized z-scores for the top 10 models.

    Each model score is normalized as ``(score - mean) / std`` using the mean
    and standard deviation of the largest score cluster. The result is
    written to the output file and echoed to stdout.

    Args:
        argv: list of argument strings; None makes argparse read sys.argv[1:].

    Returns:
        0 when the output file was written, 1 when an error was reported
        (in which case no output file is created).
    """
    args = _parse_args(argv)
    input_file = args.input_file
    output_name = args.out_name if args.out_name else input_file + '.normzscore'

    parsed = _read_scores(input_file)
    if parsed is None:
        print('Error: There is no DOT score information in ' + input_file)
        print('Check ' + input_file + ' to make sure VESPER has run properly')
        return 1

    score_list, model_score_info = parsed
    if np.any(np.isnan(score_list)) or np.any(np.isnan(model_score_info)):
        print(score_list)
        print(model_score_info)
        print('Error: NaN value exists in ' + input_file)
        print('Calculation of normalized z-score is not performed')
        print('Check ' + input_file + ' to make sure VESPER has run properly')
        return 1

    # Hierarchical clustering needs at least two observations.
    if len(score_list) < 2:
        print('Error: At least two scores are needed for clustering in ' + input_file)
        print('Calculation of normalized z-score is not performed')
        print('Check ' + input_file + ' to make sure VESPER has run properly')
        return 1

    # calculate mean and std of the largest cluster
    adjust_mean, adjust_std = _largest_cluster_stats(score_list, args.cutoff)
    if adjust_std == 0:
        # Dividing by zero would only produce inf/nan "z-scores".
        print('Error: Standard deviation of the largest cluster is zero in ' + input_file)
        print('Calculation of normalized z-score is not performed')
        print('Check ' + input_file + ' to make sure VESPER has run properly')
        return 1

    # calculate normalized z-score for each model
    norm_zscores = (model_score_info - adjust_mean) / adjust_std
    header = 'Normalized z-score for top 10 models:'
    rows = ['#' + str(i) + '\t' + str(norm_zscore) for i, norm_zscore in enumerate(norm_zscores)]

    # The context manager closes the file even if a write fails.
    with open(output_name, 'w') as output:
        output.write(header + '\n')
        output.write('\n'.join(rows) + '\n')
    print(header)
    for row in rows:
        print(row)
    return 0


if __name__ == '__main__':
    # The return value is deliberately not passed to sys.exit() so that the
    # process exit status stays what it has always been for existing callers.
    main()