-
Notifications
You must be signed in to change notification settings - Fork 4
/
benchmark_mmtf_python.py
151 lines (137 loc) · 5.1 KB
/
benchmark_mmtf_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import random
import os
from timeit import default_timer as timer
import gzip
try:
from Bio.PDB import FastMMCIFParser
from Bio.PDB import MMCIFParser
from Bio.PDB import PDBParser
from Bio.PDB.mmtf import MMTFParser
from Bio.PDB import PDBList
except Exception:
print("please have Biopython installed (eg: pip install biopython)")
try:
import wget
except Exception:
print("please have wget installed (eg: pip install wget)")
# Sample lists to benchmark. The first four characters of every line in a
# list are read as a protein ID.
# Full set of lists: sample_3j3q.csv, sample_25.csv, sample_50.csv,
# sample_75.csv, sample_1000.csv
infiles = ["sample_25.csv"]

# Formats to benchmark: three file types (mmtf, cif, pdb) plus "fast_cif",
# which parses the cif files with FastMMCIFParser.
file_types = ["mmtf", "cif", "fast_cif", "pdb"]
def get_proteins(x):
    """Read a protein list file and return its protein IDs.

    Parameters:
    * x: path of a text file holding one protein per line. Only the first
      four characters of each whitespace-stripped line are kept, and blank
      lines are skipped.

    Returns a list of upper-cased ID strings in file order.
    """
    # The context manager closes the handle as soon as the list is built
    # instead of leaving it open until garbage collection.
    with open(x, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [line[:4].upper() for line in stripped if line]
def get_all_protein(file_type, proteins):
    """Download, decompress and store every listed protein in one format.

    Each entry is fetched as a gzip archive into the current working
    directory, decompressed, and moved to ``<cwd>/<file_type>/``. When the
    destination already holds that entry, the freshly extracted copy is
    discarded and the existing file is kept.

    Parameters:
    * file_type: the type of file to be downloaded ('mmtf', 'cif' or 'pdb';
      'fast_cif' is treated as 'cif')
    * proteins: the list of all proteins to be downloaded
    """
    # Local import: only the streamed decompression below needs it.
    import shutil

    # fast_cif parses the same files as cif, so both share one download.
    if file_type == 'fast_cif':
        file_type = 'cif'

    cwd = os.getcwd()
    # Files are grouped into one sub-directory per format.
    directory = os.path.join(cwd, file_type)

    for protein in proteins:
        if file_type == 'pdb' and protein == '3J3Q':
            print("\n 3j3q not in pdb website")
            continue

        if file_type == 'mmtf':
            url = "http://mmtf.rcsb.org/v1.0/full/%s.mmtf.gz" % (protein)
        elif file_type == 'cif':
            url = "https://files.rcsb.org/download/%s.cif.gz" % (protein)
        else:
            url = "https://files.rcsb.org/download/%s.pdb.gz" % (protein)

        # wget.download returns the name it actually saved to, which can
        # differ from the URL's basename (e.g. when that name is taken).
        archive = wget.download(url)

        # Stream the decompression so a large entry is never held in memory,
        # and close both handles even if the copy fails.
        extracted = os.path.join(cwd, "%s.%s" % (protein, file_type))
        with gzip.open(archive, 'rb') as packed, \
                open(extracted, 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)
        os.remove(archive)

        if not os.path.exists(directory):
            os.makedirs(directory)

        destination = os.path.join(directory, "%s.%s" % (protein, file_type))
        if os.path.exists(destination):
            # Already stored by an earlier run: drop the duplicate.
            os.remove(extracted)
        else:
            os.rename(extracted, destination)
def time_download(infile, proteins):
    """Time the download of a protein list in every downloadable format.

    One line per format is appended to download_benchmark_results.csv.
    Appending (rather than truncating) keeps the timings of earlier sample
    lists when several lists are benchmarked in one run; each line is
    prefixed with the sample list name so the entries stay distinguishable.

    Parameters:
    * infile: name of the sample list the proteins came from (used only to
      label the result lines)
    * proteins: the list of proteins to be downloaded
    """
    time_keeper = []
    for file_type in file_types:
        # fast_cif reuses the cif files, so there is nothing extra to fetch.
        if file_type == 'fast_cif':
            continue
        begin = timer()
        get_all_protein(file_type, proteins)
        elapsed = timer() - begin
        time_keeper.append(
            "%s: download all %s files took %f seconds\n"
            % (infile, file_type, elapsed)
        )

    # write benchmark results to text file
    with open("download_benchmark_results.csv", "a") as o:
        o.writelines(time_keeper)
def loop_parsing(file_type, proteins, rep=10):
    """Parse every listed protein once with the parser for one format.

    Files are read from ``<cwd>/<ext>/<protein>.<ext>``, where ``ext`` is the
    file type ('cif' for fast_cif). A protein that cannot be parsed is
    reported and skipped so the remaining proteins are still processed.

    Parameters:
    * file_type: the type of file to be parsed ('mmtf', 'cif', 'fast_cif';
      any other value selects PDBParser)
    * proteins: the list of proteins to be parsed
    * rep: kept for backward compatibility; currently unused, each protein
      is parsed exactly once
    """
    if file_type == 'mmtf':
        parser = MMTFParser()
    elif file_type == 'fast_cif':
        parser = FastMMCIFParser()
    elif file_type == 'cif':
        parser = MMCIFParser()
    else:
        parser = PDBParser()

    # FastMMCIFParser reads the same .cif files as MMCIFParser.
    extension = 'cif' if file_type == 'fast_cif' else file_type
    folder = os.path.join(os.getcwd(), extension)

    for p in proteins:
        path = os.path.join(folder, "%s.%s" % (p, extension))
        try:
            if extension == 'mmtf':
                parser.get_structure(path)
            else:
                # The protein ID is a stable, meaningful structure id.
                parser.get_structure(p, path)
        except Exception:
            # Best effort: report the failure and move on to the next
            # protein rather than abandoning the rest of the list.
            print("Having trouble parsing %s" % (p))
def time_parsing(proteins):
    """Measure the total parsing time of a protein list for each format.

    * takes a list of protein as input
    * returns a dict mapping every entry of file_types to the number of
      seconds loop_parsing took for that format
    """
    def _seconds_for(fmt):
        # Wall-clock time of one full pass over the list in this format.
        started = timer()
        loop_parsing(fmt, proteins)
        finished = timer()
        return finished - started

    # iterate and time all filetypes
    return {fmt: _seconds_for(fmt) for fmt in file_types}
# Main function for benchmark: for every sample list, download its proteins
# and time the parsers, then write one row of parsing totals per list.
if __name__ == '__main__':
    infile_time = {}
    # loop over all files and benchmark
    for sample in infiles:
        ids = get_proteins(sample)
        time_download(sample, ids)
        infile_time[sample] = time_parsing(ids)

    # write benchmark results to text file
    with open("parsing_benchmark_results.csv", "w") as out:
        out.write("File,MMTF,MMCIF,FastMMCIF,PDB \n")
        for sample in infiles:
            totals = infile_time[sample]
            # The sample name loses its 4-character extension; the columns
            # follow the header order.
            row = [sample[:-4]]
            for key in ("mmtf", "cif", "fast_cif", "pdb"):
                row.append(str(totals[key]))
            out.write(",".join(row) + "\n")