import time
import urllib3
import argparse
+ import json
+ import zlib
+ from xml.etree import ElementTree
+ from urllib.parse import urlparse, parse_qs, urlencode
+ import requests
+ from requests.adapters import HTTPAdapter, Retry
+
+ # Constants for the UniProt ID-mapping queries (REST API as of 12/07/2022)
+ POLLING_INTERVAL = 3
+ API_URL = "https://rest.uniprot.org"
+ retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
+ session = requests.Session()
+ session.mount("https://", HTTPAdapter(max_retries=retries))

#############
# Functions #
#############

+ def submit_id_mapping(from_db, to_db, ids):
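+     '''
+     Submit an ID-mapping job to the UniProt REST API and return its job ID.
+     '''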
+     request = requests.post(
+         f"{API_URL}/idmapping/run",
+         data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
+     )
+     request.raise_for_status()
+     session.close()
+     return request.json()["jobId"]
+
+
+ def get_next_link(headers):
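+     '''
+     Return the URL of the next page of results from the "Link" response header, if present.
+     '''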
+     re_next_link = re.compile(r'<(.+)>; rel="next"')
+     if "Link" in headers:
+         match = re_next_link.match(headers["Link"])
+         if match:
+             return match.group(1)
+
+
+ def check_id_mapping_results_ready(job_id):
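+     '''
+     Poll the job status every POLLING_INTERVAL seconds until the job has finished.
+     '''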
+     while True:
+         request = session.get(f"{API_URL}/idmapping/status/{job_id}")
+         request.raise_for_status()
+         j = request.json()
+         if "jobStatus" in j:
+             if j["jobStatus"] == "RUNNING":
+                 # print(f"Retrying in {POLLING_INTERVAL}s")
+                 time.sleep(POLLING_INTERVAL)
+             else:
+                 # raise from the decoded JSON, not the Response object
+                 raise Exception(j["jobStatus"])
+         else:
+             session.close()
+             return bool(j["results"] or j["failedIds"])
+
+
+ def get_batch(batch_response, file_format, compressed):
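+     '''
+     Follow the pagination links and yield one decoded batch of results per page.
+     '''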
+     batch_url = get_next_link(batch_response.headers)
+     while batch_url:
+         batch_response = session.get(batch_url)
+         batch_response.raise_for_status()
+         yield decode_results(batch_response, file_format, compressed)
+         batch_url = get_next_link(batch_response.headers)
+
+
+ def combine_batches(all_results, batch_results, file_format):
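+     '''
+     Merge a decoded batch into the accumulated results; for TSV the repeated
+     header line of each batch is skipped.
+     '''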
+     if file_format == "json":
+         for key in ("results", "failedIds"):
+             if key in batch_results and batch_results[key]:
+                 all_results[key] += batch_results[key]
+     elif file_format == "tsv":
+         return all_results + batch_results[1:]
+     else:
+         return all_results + batch_results
+     return all_results
+
+
+ def get_id_mapping_results_link(job_id):
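+     '''
+     Return the redirect URL from which the results of a finished job can be fetched.
+     '''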
+     url = f"{API_URL}/idmapping/details/{job_id}"
+     request = session.get(url)
+     request.raise_for_status()
+     return request.json()["redirectURL"]
+
+
+ def decode_results(response, file_format, compressed):
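+     '''
+     Decode a results response according to the requested format,
+     gunzipping the payload first when it is compressed.
+     '''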
+     if compressed:
+         decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+         if file_format == "json":
+             j = json.loads(decompressed.decode("utf-8"))
+             return j
+         elif file_format == "tsv":
+             return [line for line in decompressed.decode("utf-8").split("\n") if line]
+         elif file_format == "xlsx":
+             return [decompressed]
+         elif file_format == "xml":
+             return [decompressed.decode("utf-8")]
+         else:
+             return decompressed.decode("utf-8")
+     elif file_format == "json":
+         return response.json()
+     elif file_format == "tsv":
+         return [line for line in response.text.split("\n") if line]
+     elif file_format == "xlsx":
+         return [response.content]
+     elif file_format == "xml":
+         return [response.text]
+     return response.text
+
+
+ def get_xml_namespace(element):
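+     '''
+     Return the XML namespace of an element's tag, or an empty string if it has none.
+     '''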
+     m = re.match(r"\{(.*)\}", element.tag)
+     return m.groups()[0] if m else ""
+
+
+ def merge_xml_results(xml_results):
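+     '''
+     Merge the <entry> elements of every XML batch into the first document
+     and return it serialized as a single XML string.
+     '''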
+     merged_root = ElementTree.fromstring(xml_results[0])
+     for result in xml_results[1:]:
+         root = ElementTree.fromstring(result)
+         for child in root.findall("{http://uniprot.org/uniprot}entry"):
+             merged_root.insert(-1, child)
+     ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
+     return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
+
+
+ def print_progress_batches(batch_index, size, total):
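+     '''
+     Compute the number of entries fetched so far (the progress print itself is disabled).
+     '''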
+     n_fetched = min((batch_index + 1) * size, total)
+     # print(f"Fetched: {n_fetched} / {total}")
+
+
+ def get_id_mapping_results_search(url):
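+     '''
+     Fetch the first page of results from the paginated search endpoint,
+     then iterate over the remaining pages and combine them.
+     '''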
+     parsed = urlparse(url)
+     query = parse_qs(parsed.query)
+     file_format = query["format"][0] if "format" in query else "json"
+     if "size" in query:
+         size = int(query["size"][0])
+     else:
+         size = 500
+         query["size"] = size
+     compressed = (
+         query["compressed"][0].lower() == "true" if "compressed" in query else False
+     )
+     parsed = parsed._replace(query=urlencode(query, doseq=True))
+     url = parsed.geturl()
+     request = session.get(url)
+     request.raise_for_status()
+     results = decode_results(request, file_format, compressed)
+     total = int(request.headers["x-total-results"])
+     print_progress_batches(0, size, total)
+     for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+         results = combine_batches(results, batch, file_format)
+         print_progress_batches(i, size, total)
+     if file_format == "xml":
+         return merge_xml_results(results)
+     return results
+
+
+ def get_id_mapping_results_stream(url):
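+     '''
+     Fetch the complete result set in a single request through the stream endpoint.
+     '''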
+     if "/stream/" not in url:
+         url = url.replace("/results/", "/stream/")
+     request = session.get(url)
+     request.raise_for_status()
+     parsed = urlparse(url)
+     query = parse_qs(parsed.query)
+     file_format = query["format"][0] if "format" in query else "json"
+     compressed = (
+         query["compressed"][0].lower() == "true" if "compressed" in query else False
+     )
+     return decode_results(request, file_format, compressed)

def resultsFormat(res, dico):
    '''
@@ -53,7 +213,7 @@ def resultsFormat(res, dico):

def getENAidMatchingToUniProtid(uniprotAccessions, batchesSize, PoolManager):
    '''
-     Allows the correspondence between a UniProt accession and nuclotide accessions.
+     Allows the correspondence between a UniProt accession and nucleotide accessions.
    Batch splitting to lighten the request.
    '''
    logger = logging.getLogger('{}.{}'.format(
@@ -63,14 +223,42 @@ def getENAidMatchingToUniProtid(uniprotAccessions, batchesSize, PoolManager):
    nbEntriesProcessed = 0
    logger.info(
        'Beginning of the correspondence between the UniProt and ENA identifiers...')
+     crossReference = {}
    while uniprotAccessions:
-         accessions = '+OR+id:'.join(uniprotAccessions[:batchesSize])
+
+         # accessions = '+OR+accession_id:'.join(uniprotAccessions[:batchesSize])
          # print("accesions {}".format(accessions))
-         res = common.httpRequest(
-             PoolManager, 'GET', 'https://www.uniprot.org/uniprot/?query=id:{}&columns=id,database(EMBL),database(EMBL_CDS)&format=tab'.format(accessions))
-         crossReference = resultsFormat(res, crossReference)
+         # res = common.httpRequest(
+         #     PoolManager, 'GET', 'https://www.uniprot.org/uniprot/?query=id:{}&columns=id,database(EMBL),database(EMBL_CDS)&format=tab'.format(accessions))
+         #     PoolManager, 'GET', 'https://rest.uniprot.org/uniprotkb/search?query=accession_id:{}&fields=accession,xref_embl&format=tsv'.format(accessions))
+         # crossReference = resultsFormat(res, crossReference)
          # logger.info('uniprot {} crossref {}'.format(accessions, crossReference))
-         nbEntriesProcessed += len(uniprotAccessions[:batchesSize])
+         # nbEntriesProcessed += len(uniprotAccessions[:batchesSize])
+
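+         # Two ID-mapping jobs per batch: UniProt accessions to EMBL CDS ids,
+         # then UniProt accessions to EMBL (nucleotide) ids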
+         accessions = uniprotAccessions[:batchesSize]
+         job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS", ids=accessions)
+         if check_id_mapping_results_ready(job_id):
+             link = get_id_mapping_results_link(job_id)
+             results = get_id_mapping_results_search(link)
+             session.close()
+             data = results['results']
+             for row in data:
+                 if row['from'] not in crossReference:
+                     crossReference[row['from']] = {}
+                     crossReference[row['from']]['Cross-reference (embl)'] = []
+                     crossReference[row['from']]['Cross-reference (EMBL)'] = []
+                 crossReference[row['from']]['Cross-reference (embl)'].append(row['to'])
+
+         job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ", ids=accessions)
+         if check_id_mapping_results_ready(job_id):
+             link = get_id_mapping_results_link(job_id)
+             results = get_id_mapping_results_search(link)
+             session.close()
+             data = results['results']
+             for row in data:
+                 if row['from'] in crossReference:
+                     crossReference[row['from']]['Cross-reference (EMBL)'].append(row['to'])
+
+         # keep the progress counter in sync now that the old increment is commented out
+         nbEntriesProcessed += len(accessions)
+
        del uniprotAccessions[:batchesSize]
        logger.info(
            'Correspondences computed: {}/{}'.format(nbEntriesProcessed, nbTotalEntries))
@@ -131,7 +319,7 @@ def getEMBLfromENA(nucleicAccession, nucleicFilePath, PoolManager):

def run(InputName):
    '''
-     Get INSDC files porocessing.
+     Get INSDC files processing.
    '''
    # Constants
    boxName = common.global_dict['boxName']['GetINSDCFiles']
@@ -153,7 +341,7 @@ def run(InputName):
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    http = urllib3.PoolManager()
    crossReference = getENAidMatchingToUniProtid(
-         list(accessions), 250, http)
+         list(accessions), 200, http)
    withoutENAidNb = len(accessions) - len(crossReference)
    reportingMessages.append(
        'Targets without ENA correspondence number: {}/{}'.format(withoutENAidNb, len(accessions)))
@@ -293,6 +481,7 @@ def main():
        'inputClusteringStep', '{}_correspondences.tsv'.format(args.OutputName))
    common.global_dict.setdefault('files', {}).setdefault(boxName, {}).setdefault(
        'report', '{}_{}_report.txt'.format(args.OutputName, boxName))
+
    #######
    # Run #
    #######