
Commit afda8cf

Modify UniProt query and correct bug in partial graph
1 parent c7540d8 commit afda8cf

5 files changed (+241, -11 lines)


.gitignore (+5)

@@ -3,3 +3,8 @@ scripts_test/
 **/*__pycache__/
 .vscode/
 files_example/
+.gitignore
+build/
+netsyn.egg-info/
+netsyn/log
+dist/

README.md (+14)

@@ -26,6 +26,20 @@ For the installation of MMseqs2 please refer to https://github.com/soedinglab/MMseqs2
 
 - biopython
 
+- requests
+
+You can easily install NetSyn in a virtual environment (example command lines below):
+virtualenv venv_netsyn
+source venv_netsyn/bin/activate
+pip install pyyaml
+pip install python-igraph
+pip install jsonschema
+pip install networkx
+pip install markov_clustering
+pip install urllib3
+pip install requests
+git clone https://github.com/labgem/netsyn
+python3 setup.py install
+
 ## Basic Usage
 
 NetSyn can be used with 2 different input file formats. One is a file containing a list of UniProt accessions (`-u` option), while the other one is a correspondences file (`-c` option). The two types of file are described in the Input Data part. It is possible to start an analysis with both input file formats. It leads to 3 NetSyn basic usage callings:
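
A hypothetical sketch of these three callings (illustration only; the `netsyn` entry point and the `-o` output-name option are assumptions, as only `-u` and `-c` are documented in the paragraph above):

    netsyn -u uniprot_accessions.list -o MyProject
    netsyn -c correspondences.tsv -o MyProject
    netsyn -u uniprot_accessions.list -c correspondences.tsv -o MyProject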

netsyn/__init__.py (+1 -1)

@@ -1,3 +1,3 @@
 #!/usr/bin/env python3
 
-__version__ = '0.1.0'
+__version__ = '0.1.1'

netsyn/netsyn_getINSDCFiles.py (+197 -8)

@@ -12,11 +12,171 @@
 import time
 import urllib3
 import argparse
+import re
+import json
+import zlib
+from xml.etree import ElementTree
+from urllib.parse import urlparse, parse_qs, urlencode
+import requests
+from requests.adapters import HTTPAdapter, Retry
+
+# Constants for the UniProt ID-mapping query (modification date 12/07/2022)
+POLLING_INTERVAL = 3
+API_URL = "https://rest.uniprot.org"
+retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
+session = requests.Session()
+session.mount("https://", HTTPAdapter(max_retries=retries))
 
 #############
 # Functions #
 #############
 
+def submit_id_mapping(from_db, to_db, ids):
+    # Submit an asynchronous ID-mapping job and return its job id.
+    request = requests.post(
+        f"{API_URL}/idmapping/run",
+        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
+    )
+    request.raise_for_status()
+    session.close()
+    return request.json()["jobId"]
+
+
+def get_next_link(headers):
+    # Extract the URL of the next result page from the 'Link' response header.
+    re_next_link = re.compile(r'<(.+)>; rel="next"')
+    if "Link" in headers:
+        match = re_next_link.match(headers["Link"])
+        if match:
+            return match.group(1)
+
+
+def check_id_mapping_results_ready(job_id):
+    # Poll the job status every POLLING_INTERVAL seconds until it finishes.
+    while True:
+        request = session.get(f"{API_URL}/idmapping/status/{job_id}")
+        request.raise_for_status()
+        j = request.json()
+        if "jobStatus" in j:
+            if j["jobStatus"] == "RUNNING":
+                # print(f"Retrying in {POLLING_INTERVAL}s")
+                time.sleep(POLLING_INTERVAL)
+            else:
+                raise Exception(j["jobStatus"])
+        else:
+            session.close()
+            return bool(j.get("results") or j.get("failedIds"))
+
+
+def get_batch(batch_response, file_format, compressed):
+    # Follow the pagination links and yield each decoded batch of results.
+    batch_url = get_next_link(batch_response.headers)
+    while batch_url:
+        batch_response = session.get(batch_url)
+        batch_response.raise_for_status()
+        yield decode_results(batch_response, file_format, compressed)
+        batch_url = get_next_link(batch_response.headers)
+
+
+def combine_batches(all_results, batch_results, file_format):
+    if file_format == "json":
+        for key in ("results", "failedIds"):
+            if key in batch_results and batch_results[key]:
+                all_results[key] += batch_results[key]
+    elif file_format == "tsv":
+        return all_results + batch_results[1:]
+    else:
+        return all_results + batch_results
+    return all_results
+
+
+def get_id_mapping_results_link(job_id):
+    # A finished job redirects to the URL where its results can be fetched.
+    url = f"{API_URL}/idmapping/details/{job_id}"
+    request = session.get(url)
+    request.raise_for_status()
+    return request.json()["redirectURL"]
+
+
+def decode_results(response, file_format, compressed):
+    if compressed:
+        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+        if file_format == "json":
+            return json.loads(decompressed.decode("utf-8"))
+        elif file_format == "tsv":
+            return [line for line in decompressed.decode("utf-8").split("\n") if line]
+        elif file_format == "xlsx":
+            return [decompressed]
+        elif file_format == "xml":
+            return [decompressed.decode("utf-8")]
+        else:
+            return decompressed.decode("utf-8")
+    elif file_format == "json":
+        return response.json()
+    elif file_format == "tsv":
+        return [line for line in response.text.split("\n") if line]
+    elif file_format == "xlsx":
+        return [response.content]
+    elif file_format == "xml":
+        return [response.text]
+    return response.text
+
+
+def get_xml_namespace(element):
+    m = re.match(r"\{(.*)\}", element.tag)
+    return m.groups()[0] if m else ""
+
+
+def merge_xml_results(xml_results):
+    # Merge the <entry> elements of every XML batch into a single document.
+    merged_root = ElementTree.fromstring(xml_results[0])
+    for result in xml_results[1:]:
+        root = ElementTree.fromstring(result)
+        for child in root.findall("{http://uniprot.org/uniprot}entry"):
+            merged_root.insert(-1, child)
+    ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
+    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
+
+
+def print_progress_batches(batch_index, size, total):
+    n_fetched = min((batch_index + 1) * size, total)
+    # print(f"Fetched: {n_fetched} / {total}")
+
+
+def get_id_mapping_results_search(url):
+    # Fetch all result pages of a finished job and combine them.
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query)
+    file_format = query["format"][0] if "format" in query else "json"
+    if "size" in query:
+        size = int(query["size"][0])
+    else:
+        size = 500
+        query["size"] = size
+    compressed = (
+        query["compressed"][0].lower() == "true" if "compressed" in query else False
+    )
+    parsed = parsed._replace(query=urlencode(query, doseq=True))
+    url = parsed.geturl()
+    request = session.get(url)
+    request.raise_for_status()
+    results = decode_results(request, file_format, compressed)
+    total = int(request.headers["x-total-results"])
+    print_progress_batches(0, size, total)
+    for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+        results = combine_batches(results, batch, file_format)
+        print_progress_batches(i, size, total)
+    if file_format == "xml":
+        return merge_xml_results(results)
+    return results
+
+
+def get_id_mapping_results_stream(url):
+    # Fetch all results in a single request via the /stream/ endpoint.
+    if "/stream/" not in url:
+        url = url.replace("/results/", "/stream/")
+    request = session.get(url)
+    request.raise_for_status()
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query)
+    file_format = query["format"][0] if "format" in query else "json"
+    compressed = (
+        query["compressed"][0].lower() == "true" if "compressed" in query else False
+    )
+    return decode_results(request, file_format, compressed)
 
 
 def resultsFormat(res, dico):
     '''
@@ -53,7 +213,7 @@ def resultsFormat(res, dico):
 
 def getENAidMatchingToUniProtid(uniprotAccessions, batchesSize, PoolManager):
     '''
-    Allows the correspondence between a UniProt accession and nuclotide accessions.
+    Allows the correspondence between a UniProt accession and nucleotide accessions.
     Batch splitting to lighten the request.
     '''
     logger = logging.getLogger('{}.{}'.format(
@@ -63,14 +223,42 @@ def getENAidMatchingToUniProtid(uniprotAccessions, batchesSize, PoolManager):
     nbEntriesProcessed = 0
     logger.info(
         'Beginning of the correspondence between the UniProt and ENA identifiers...')
+    crossReference = {}
     while uniprotAccessions:
-        accessions = '+OR+id:'.join(uniprotAccessions[:batchesSize])
+
+        # accessions = '+OR+accession_id:'.join(uniprotAccessions[:batchesSize])
         # print("accesions {}".format(accessions))
-        res = common.httpRequest(
-            PoolManager, 'GET', 'https://www.uniprot.org/uniprot/?query=id:{}&columns=id,database(EMBL),database(EMBL_CDS)&format=tab'.format(accessions))
-        crossReference = resultsFormat(res, crossReference)
+        # res = common.httpRequest(
+        #     PoolManager, 'GET', 'https://www.uniprot.org/uniprot/?query=id:{}&columns=id,database(EMBL),database(EMBL_CDS)&format=tab'.format(accessions))
+        #     PoolManager, 'GET', 'https://rest.uniprot.org/uniprotkb/search?query=accession_id:{}&fields=accession,xref_embl&format=tsv'.format(accessions))
+        # crossReference = resultsFormat(res, crossReference)
         # logger.info('uniprot {} crossref {}'.format(accessions, crossReference))
-        nbEntriesProcessed += len(uniprotAccessions[:batchesSize])
+        # nbEntriesProcessed += len(uniprotAccessions[:batchesSize])
+
+        accessions = uniprotAccessions[:batchesSize]
+        # First job: map the UniProt accessions to EMBL CDS identifiers.
+        job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS", ids=accessions)
+        if check_id_mapping_results_ready(job_id):
+            link = get_id_mapping_results_link(job_id)
+            results = get_id_mapping_results_search(link)
+            session.close()
+            for row in results['results']:
+                if row['from'] not in crossReference:
+                    crossReference[row['from']] = {}
+                    crossReference[row['from']]['Cross-reference (embl)'] = []
+                    crossReference[row['from']]['Cross-reference (EMBL)'] = []
+                crossReference[row['from']]['Cross-reference (embl)'].append(row['to'])
+
+        # Second job: map the same accessions to EMBL nucleotide identifiers.
+        job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ", ids=accessions)
+        if check_id_mapping_results_ready(job_id):
+            link = get_id_mapping_results_link(job_id)
+            results = get_id_mapping_results_search(link)
+            session.close()
+            for row in results['results']:
+                if row['from'] in crossReference:
+                    crossReference[row['from']]['Cross-reference (EMBL)'].append(row['to'])
+
+        nbEntriesProcessed += len(accessions)
         del uniprotAccessions[:batchesSize]
         logger.info(
             'Correspondences computed: {}/{}'.format(nbEntriesProcessed, nbTotalEntries))
@@ -131,7 +319,7 @@ def getEMBLfromENA(nucleicAccession, nucleicFilePath, PoolManager):
 
 def run(InputName):
     '''
-    Get INSDC files porocessing.
+    Get INSDC files processing.
     '''
     # Constants
     boxName = common.global_dict['boxName']['GetINSDCFiles']
@@ -153,7 +341,7 @@
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
     http = urllib3.PoolManager()
     crossReference = getENAidMatchingToUniProtid(
-        list(accessions), 250, http)
+        list(accessions), 200, http)
     withoutENAidNb = len(accessions)-len(crossReference)
     reportingMessages.append(
         'Targets without ENA correspondence number: {}/{}'.format(withoutENAidNb, len(accessions)))
@@ -293,6 +481,7 @@ def main():
         'inputClusteringStep', '{}_correspondences.tsv'.format(args.OutputName))
     common.global_dict.setdefault('files', {}).setdefault(boxName, {}).setdefault(
         'report', '{}_{}_report.txt'.format(args.OutputName, boxName))
+
     #######
     # Run #
     #######
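
The hunks above replace the retired www.uniprot.org query URL with UniProt's asynchronous ID-mapping REST API. Below is a minimal sketch of how the new helpers chain together; it assumes the functions defined in this diff are in scope, and the accession list is a placeholder, not taken from the commit:

    # Sketch only: reuses submit_id_mapping, check_id_mapping_results_ready,
    # get_id_mapping_results_link and get_id_mapping_results_search from above.
    accessions = ["P0A8M3", "P0A8N0"]  # placeholder UniProt accessions

    # 1. Submit an asynchronous job mapping UniProtKB accessions to EMBL CDS ids.
    job_id = submit_id_mapping(
        from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS", ids=accessions)

    # 2. Poll /idmapping/status/{job_id} every POLLING_INTERVAL seconds.
    if check_id_mapping_results_ready(job_id):
        # 3. Resolve the redirect URL, then page through the result batches.
        link = get_id_mapping_results_link(job_id)
        results = get_id_mapping_results_search(link)
        for row in results["results"]:
            print(row["from"], "->", row["to"])  # UniProt accession -> EMBL CDS id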

netsyn/netsyn_syntenyFinder.py (+24 -2)

@@ -18,6 +18,26 @@
 # Functions #
 #############
 
+def fix_dendrogram(graph, cl):
+    # Complete an incomplete walktrap dendrogram (as produced for a
+    # disconnected graph) so that as_clustering() can be applied to it.
+    already_merged = set()
+    for merge in cl.merges:
+        already_merged.update(merge)
+
+    num_dendrogram_nodes = graph.vcount() + len(cl.merges)
+    not_merged_yet = sorted(set(range(num_dendrogram_nodes)) - already_merged)
+    if len(not_merged_yet) < 2:
+        return (graph, cl)
+
+    v1, v2 = not_merged_yet[:2]
+    cl._merges.append((v1, v2))
+    del not_merged_yet[:2]
+
+    missing_nodes = range(num_dendrogram_nodes,
+                          num_dendrogram_nodes + len(not_merged_yet))
+    cl._merges.extend(zip(not_merged_yet, missing_nodes))
+    cl._nmerges = graph.vcount() - 1
+    return (graph, cl)
+
 
 def get_userGC(targets_info, windowSize):
     ''' reduce the genomic context to the user parameter '--WindowSize'
@@ -398,7 +418,6 @@ def run(PROTEINS, TARGETS, GCUSER, GAP, CUTOFF, ADVANCEDSETTINGSFILENAME):
         PROTEINS, common.getProteinsFamiliesStepSchema())
     targets_info = common.readJSON(
         TARGETS, common.getTargetsTaxonomyStepschema())
-
     targets_syntons = {}
     no_synteny = 0
     # logger.info('Length of the targets list: {}'.format(len(targets_info)))
@@ -461,8 +480,10 @@
 
     # Walktrap clustering
     logger.info('Walktrap clustering...')
+    print("graph {}".format(maxi_graph))
     graph_walktrap = maxi_graph.community_walktrap(
         weights='weight', steps=advanced_settings[common.global_dict['WalkTrap']]['walktrap_step'])
+    maxi_graph, graph_walktrap = fix_dendrogram(maxi_graph, graph_walktrap)
     walktrap_clustering = graph_walktrap.as_clustering()
 
     for cluster in range(len(walktrap_clustering)):
@@ -521,7 +542,8 @@
     nxGraph.add_weighted_edges_from([(names[x[0]], names[x[1]], maxi_graph.es[maxi_graph.get_eid(
         x[0], x[1])]['weight']) for x in maxi_graph.get_edgelist()])
 
-    matrix_adjacency = nx.to_scipy_sparse_matrix(nxGraph, weight='weight')
+    matrix_adjacency = nx.to_scipy_sparse_array(nxGraph, weight='weight')
+    # to_scipy_sparse_array replaces to_scipy_sparse_matrix (removed in NetworkX 3.0)
 
     result = mc.run_mcl(
         matrix_adjacency,
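
For context on fix_dendrogram: igraph's community_walktrap on a disconnected ("partial") graph can return an incomplete dendrogram, and as_clustering() then fails. Below is a small sketch reproducing the situation; it is not part of the commit and assumes fix_dendrogram from the diff above is in scope:

    import igraph as ig

    # Two components with no connecting edge: walktrap merges vertices within
    # each component but never joins the components, so the dendrogram is
    # incomplete and as_clustering() may raise without the fix.
    g = ig.Graph(edges=[(0, 1), (1, 2), (3, 4), (4, 5)])
    g.es["weight"] = [1.0] * g.ecount()

    dendrogram = g.community_walktrap(weights="weight", steps=4)
    g, dendrogram = fix_dendrogram(g, dendrogram)  # complete the merge list
    clusters = dendrogram.as_clustering()
    print(clusters.membership)  # e.g. [0, 0, 0, 1, 1, 1]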
