import time
import urllib3
import argparse
+ import json
+ import zlib
+ from xml.etree import ElementTree
+ from urllib.parse import urlparse, parse_qs, urlencode
+ import requests
+ from requests.adapters import HTTPAdapter, Retry
+
+ # Constants for the UniProt ID-mapping queries (REST API as of 12/07/2022)
+ POLLING_INTERVAL = 3
+ API_URL = "https://rest.uniprot.org"
+ retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
+ session = requests.Session()
+ session.mount("https://", HTTPAdapter(max_retries=retries))

#############
# Functions #
#############

+ def submit_id_mapping(from_db, to_db, ids):
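+     '''
+     Submit an ID-mapping job to the UniProt REST API and return its job ID.
+     '''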
+     request = requests.post(
+         f"{API_URL}/idmapping/run",
+         data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
+     )
+     request.raise_for_status()
+     session.close()
+     return request.json()["jobId"]
+
+
+ def get_next_link(headers):
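+     '''
+     Return the URL of the next page of results from the "Link" response header, if present.
+     '''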
+     re_next_link = re.compile(r'<(.+)>; rel="next"')
+     if "Link" in headers:
+         match = re_next_link.match(headers["Link"])
+         if match:
+             return match.group(1)
+
+
+ def check_id_mapping_results_ready(job_id):
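+     '''
+     Poll the job status every POLLING_INTERVAL seconds until the job has finished.
+     '''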
+     while True:
+         request = session.get(f"{API_URL}/idmapping/status/{job_id}")
+         request.raise_for_status()
+         j = request.json()
+         if "jobStatus" in j:
+             if j["jobStatus"] == "RUNNING":
+                 # print(f"Retrying in {POLLING_INTERVAL}s")
+                 time.sleep(POLLING_INTERVAL)
+             else:
+                 # raise from the decoded JSON, not the Response object
+                 raise Exception(j["jobStatus"])
+         else:
+             session.close()
+             return bool(j["results"] or j["failedIds"])
+
+
+ def get_batch(batch_response, file_format, compressed):
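+     '''
+     Follow the pagination links and yield one decoded batch of results per page.
+     '''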
+     batch_url = get_next_link(batch_response.headers)
+     while batch_url:
+         batch_response = session.get(batch_url)
+         batch_response.raise_for_status()
+         yield decode_results(batch_response, file_format, compressed)
+         batch_url = get_next_link(batch_response.headers)
+
+
+ def combine_batches(all_results, batch_results, file_format):
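+     '''
+     Merge a decoded batch into the accumulated results; for TSV the repeated
+     header line of each batch is skipped.
+     '''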
+     if file_format == "json":
+         for key in ("results", "failedIds"):
+             if key in batch_results and batch_results[key]:
+                 all_results[key] += batch_results[key]
+     elif file_format == "tsv":
+         return all_results + batch_results[1:]
+     else:
+         return all_results + batch_results
+     return all_results
+
+
+ def get_id_mapping_results_link(job_id):
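+     '''
+     Return the redirect URL from which the results of a finished job can be fetched.
+     '''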
+     url = f"{API_URL}/idmapping/details/{job_id}"
+     request = session.get(url)
+     request.raise_for_status()
+     return request.json()["redirectURL"]
+
+
+ def decode_results(response, file_format, compressed):
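+     '''
+     Decode a results response according to the requested format,
+     gunzipping the payload first when it is compressed.
+     '''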
+     if compressed:
+         decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+         if file_format == "json":
+             j = json.loads(decompressed.decode("utf-8"))
+             return j
+         elif file_format == "tsv":
+             return [line for line in decompressed.decode("utf-8").split("\n") if line]
+         elif file_format == "xlsx":
+             return [decompressed]
+         elif file_format == "xml":
+             return [decompressed.decode("utf-8")]
+         else:
+             return decompressed.decode("utf-8")
+     elif file_format == "json":
+         return response.json()
+     elif file_format == "tsv":
+         return [line for line in response.text.split("\n") if line]
+     elif file_format == "xlsx":
+         return [response.content]
+     elif file_format == "xml":
+         return [response.text]
+     return response.text
+
+
+ def get_xml_namespace(element):
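+     '''
+     Return the XML namespace of an element's tag, or an empty string if it has none.
+     '''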
+     m = re.match(r"\{(.*)\}", element.tag)
+     return m.groups()[0] if m else ""
+
+
+ def merge_xml_results(xml_results):
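+     '''
+     Merge the <entry> elements of every XML batch into the first document
+     and return it serialized as a single XML string.
+     '''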
+     merged_root = ElementTree.fromstring(xml_results[0])
+     for result in xml_results[1:]:
+         root = ElementTree.fromstring(result)
+         for child in root.findall("{http://uniprot.org/uniprot}entry"):
+             merged_root.insert(-1, child)
+     ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
+     return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
+
+
+ def print_progress_batches(batch_index, size, total):
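+     '''
+     Compute the number of entries fetched so far (the progress print itself is disabled).
+     '''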
+     n_fetched = min((batch_index + 1) * size, total)
+     # print(f"Fetched: {n_fetched} / {total}")
+
+
+ def get_id_mapping_results_search(url):
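+     '''
+     Fetch the first page of results from the paginated search endpoint,
+     then iterate over the remaining pages and combine them.
+     '''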
+     parsed = urlparse(url)
+     query = parse_qs(parsed.query)
+     file_format = query["format"][0] if "format" in query else "json"
+     if "size" in query:
+         size = int(query["size"][0])
+     else:
+         size = 500
+         query["size"] = size
+     compressed = (
+         query["compressed"][0].lower() == "true" if "compressed" in query else False
+     )
+     parsed = parsed._replace(query=urlencode(query, doseq=True))
+     url = parsed.geturl()
+     request = session.get(url)
+     request.raise_for_status()
+     results = decode_results(request, file_format, compressed)
+     total = int(request.headers["x-total-results"])
+     print_progress_batches(0, size, total)
+     for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+         results = combine_batches(results, batch, file_format)
+         print_progress_batches(i, size, total)
+     if file_format == "xml":
+         return merge_xml_results(results)
+     return results
+
+
+ def get_id_mapping_results_stream(url):
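+     '''
+     Fetch the complete result set in a single request through the stream endpoint.
+     '''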
+     if "/stream/" not in url:
+         url = url.replace("/results/", "/stream/")
+     request = session.get(url)
+     request.raise_for_status()
+     parsed = urlparse(url)
+     query = parse_qs(parsed.query)
+     file_format = query["format"][0] if "format" in query else "json"
+     compressed = (
+         query["compressed"][0].lower() == "true" if "compressed" in query else False
+     )
+     return decode_results(request, file_format, compressed)

def resultsFormat(res, dico):
    '''
@@ -53,7 +213,7 @@ def resultsFormat(res, dico):

def getENAidMatchingToUniProtid(uniprotAccessions, batchesSize, PoolManager):
    '''
-     Allows the correspondence between a UniProt accession and nuclotide accessions.
+     Allows the correspondence between a UniProt accession and nucleotide accessions.
    Batch splitting to lighten the request.
    '''
    logger = logging.getLogger('{}.{}'.format(
@@ -63,14 +223,42 @@ def getENAidMatchingToUniProtid(uniprotAccessions, batchesSize, PoolManager):
    nbEntriesProcessed = 0
    logger.info(
        'Beginning of the correspondence between the UniProt and ENA identifiers...')
+     crossReference = {}
    while uniprotAccessions:
-         accessions = '+OR+id:'.join(uniprotAccessions[:batchesSize])
+
+         # accessions = '+OR+accession_id:'.join(uniprotAccessions[:batchesSize])
          # print("accesions {}".format(accessions))
-         res = common.httpRequest(
-             PoolManager, 'GET', 'https://www.uniprot.org/uniprot/?query=id:{}&columns=id,database(EMBL),database(EMBL_CDS)&format=tab'.format(accessions))
-         crossReference = resultsFormat(res, crossReference)
+         # res = common.httpRequest(
+         #     PoolManager, 'GET', 'https://www.uniprot.org/uniprot/?query=id:{}&columns=id,database(EMBL),database(EMBL_CDS)&format=tab'.format(accessions))
+         #     PoolManager, 'GET', 'https://rest.uniprot.org/uniprotkb/search?query=accession_id:{}&fields=accession,xref_embl&format=tsv'.format(accessions))
+         # crossReference = resultsFormat(res, crossReference)
          # logger.info('uniprot {} crossref {}'.format(accessions, crossReference))
-         nbEntriesProcessed += len(uniprotAccessions[:batchesSize])
+         # nbEntriesProcessed += len(uniprotAccessions[:batchesSize])
+
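+         # Two ID-mapping jobs per batch: UniProt accessions to EMBL CDS ids,
+         # then UniProt accessions to EMBL (nucleotide) ids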
+         accessions = uniprotAccessions[:batchesSize]
+         job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS", ids=accessions)
+         if check_id_mapping_results_ready(job_id):
+             link = get_id_mapping_results_link(job_id)
+             results = get_id_mapping_results_search(link)
+             session.close()
+             data = results['results']
+             for row in data:
+                 if row['from'] not in crossReference:
+                     crossReference[row['from']] = {}
+                     crossReference[row['from']]['Cross-reference (embl)'] = []
+                     crossReference[row['from']]['Cross-reference (EMBL)'] = []
+                 crossReference[row['from']]['Cross-reference (embl)'].append(row['to'])
+
+         job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ", ids=accessions)
+         if check_id_mapping_results_ready(job_id):
+             link = get_id_mapping_results_link(job_id)
+             results = get_id_mapping_results_search(link)
+             session.close()
+             data = results['results']
+             for row in data:
+                 if row['from'] in crossReference:
+                     crossReference[row['from']]['Cross-reference (EMBL)'].append(row['to'])
+
+         # keep the progress counter in sync now that the old increment is commented out
+         nbEntriesProcessed += len(accessions)
+
        del uniprotAccessions[:batchesSize]
        logger.info(
            'Correspondences computed: {}/{}'.format(nbEntriesProcessed, nbTotalEntries))
@@ -131,7 +319,7 @@ def getEMBLfromENA(nucleicAccession, nucleicFilePath, PoolManager):

def run(InputName):
    '''
-     Get INSDC files porocessing.
+     Get INSDC files processing.
    '''
    # Constants
    boxName = common.global_dict['boxName']['GetINSDCFiles']
@@ -153,7 +341,7 @@ def run(InputName):
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    http = urllib3.PoolManager()
    crossReference = getENAidMatchingToUniProtid(
-         list(accessions), 250, http)
+         list(accessions), 200, http)
    withoutENAidNb = len(accessions) - len(crossReference)
    reportingMessages.append(
        'Targets without ENA correspondence number: {}/{}'.format(withoutENAidNb, len(accessions)))
@@ -293,6 +481,7 @@ def main():
        'inputClusteringStep', '{}_correspondences.tsv'.format(args.OutputName))
    common.global_dict.setdefault('files', {}).setdefault(boxName, {}).setdefault(
        'report', '{}_{}_report.txt'.format(args.OutputName, boxName))
+
    #######
    # Run #
    #######