
KG microbe: TSV files into one RDF (Turtle) file

import csv
import json
import re
import urllib.parse

Relative paths to the input and output files

edgesPath = "input/merged-kg_edges.tsv"
nodesPath = "input/merged-kg_nodes.tsv"
bioregistryPath = "input/registry.json"
outputPath = "output/kg-microbe.ttl"

All prefixes used in the output are defined here

prefixes = {
    "biolink": "https://w3id.org/biolink/vocab/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "dc": "http://purl.org/dc/terms/",
    "obo": "http://purl.obolibrary.org/obo/",
    "oio": "http://www.geneontology.org/formats/oboInOwl#",
    "wd": "https://www.wikidata.org/wiki/",
    "bioregistry": "https://bioregistry.io/",
    "medi": "https://mediadive.dsmz.de/ingredients/",
    "meds": "https://mediadive.dsmz.de/solutions/",
    "medm": "https://mediadive.dsmz.de/medium/"
}

Load prefixes from registry.json, which should be downloaded from https://bioregistry.io/registry/

bioregistry_prefixes = {}
with open(bioregistryPath) as f:
    data = json.load(f)
for entity in data.values():
    # some registry entries lack a name or uri_format, so default those to None
    bioregistry_prefixes[entity["prefix"]] = {"name": entity.get("name"), "uri_format": entity.get("uri_format")}
del data  # free the parsed registry, only bioregistry_prefixes is needed from here on
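
For reference, the converter reads only three fields from each registry entry. A typical entry looks roughly like this (illustrative excerpt, remaining fields omitted):

"chebi": {
    "prefix": "chebi",
    "name": "Chemical Entities of Biological Interest",
    "uri_format": "http://purl.obolibrary.org/obo/CHEBI_$1",
    ...
}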

Here we open the output file and write the prefix declarations

outputStream = open(outputPath, "w")

for p, ns in prefixes.items():
    outputStream.write(f"@prefix {p}: <{ns}> .\n")

outputStream.write("\n")
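
The header of kg-microbe.ttl then starts with one @prefix declaration per entry:

@prefix biolink: <https://w3id.org/biolink/vocab/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .

and so on for the remaining prefixes.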

Helpers for writing triples and for URI extraction. URIs are taken from the iri column first; if the IRI is missing, they are resolved through https://bioregistry.io/registry/. Anything that cannot be resolved is replaced with a URN of the form <urn:unknown:id>.

def add_triple(s: str, p: str, o: str):
    outputStream.write(f"{s} {p} {o} .\n")

def add_label(s: str, label: str):
    # json.dumps yields a double-quoted, escaped string, which is also a valid Turtle string literal
    add_triple(s, "rdfs:label", json.dumps(label))

def add_type(s: str, t: str):
    add_triple(s, "rdf:type", t)

def add_synonym(s: str, syn: str):
    add_triple(s, "biolink:synonym", json.dumps(syn))

def add_reference(s: str, ref: str):
    add_triple(s, "dc:identifier", json.dumps(ref))
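
A quick illustration of the escaping (hypothetical label text):

add_label("obo:CHEBI_15377", 'water ("aqua")')
# writes: obo:CHEBI_15377 rdfs:label "water (\"aqua\")" .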

unprefixes = set()  # collects prefixes that could not be resolved via the registry

def extract_uri(id: str):
    uri_parts = id.split(":", 1)
    if len(uri_parts) == 1:
        # no prefix at all, fall back to the urn:unknown format
        encoded_id = urllib.parse.quote(id, safe="")
        return f"urn:unknown:{encoded_id}"
    prefix = uri_parts[0].lower()
    def_prefix = bioregistry_prefixes.get(prefix)
    if def_prefix is None:
        unprefixes.add(prefix)
        encoded_id = urllib.parse.quote(id, safe="")
        return f"urn:unknown:{encoded_id}"
    uf = def_prefix["uri_format"]  # may be None
    if uf is None or "$1" not in uf:
        return f"https://bioregistry.io/{prefix}:{uri_parts[1]}"
    return uf.replace("$1", uri_parts[1])
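
A few illustrative calls (the first one assumes the chebi entry shown earlier is present in the registry):

print(extract_uri("CHEBI:15377"))  # http://purl.obolibrary.org/obo/CHEBI_15377
print(extract_uri("FOO:123"))      # urn:unknown:FOO%3A123 (prefix not in the registry)
print(extract_uri("bare-id"))      # urn:unknown:bare-id (no prefix at all)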

First, we add two hand-written triples: human-readable labels for the predicates that the converter itself introduces.

add_triple("biolink:synonym", "rdfs:label", "\"Synonym\"")
add_triple("dc:identifier", "rdfs:label", "\"Reference\"")

Node extraction into the output file

For each node it writes:

  • the label as rdfs:label,
  • the types as rdf:type, given as biolink:<type>,
  • the synonyms as biolink:synonym,
  • a reference to the entity URI as dc:identifier.

i = 0
f_in = open(nodesPath, newline="")
reader = csv.reader(f_in, delimiter="\t")
rowsIt = iter(reader)
# map column names to their indices
header = {k: v for v, k in enumerate(next(rowsIt))}
id_to_uri = {}
for row in rowsIt:
    i += 1
    if i % 50000 == 0: print(f"processed lines: {i}")
    uri = row[header["iri"]].split("|")[0].strip()
    id = row[header["id"]].strip()
    if not uri:
        uri = extract_uri(id)
    if not uri: continue
    # try to shorten the uri into a prefixed name, escaping characters that are special in Turtle local names
    puri = None
    for p, ns in prefixes.items():
        if uri.startswith(ns):
            local = re.sub("([~.!$&'\"()*+,;=/?#@%])", r"\\\1", uri[len(ns):])
            puri = f"{p}:{local}"
            break
    s = puri if puri else f"<{uri}>"
    id_to_uri[id] = s
    n = row[header["name"]].strip()
    if len(n) > 0: add_label(s, n)
    if not uri.startswith("urn:unknown:"):
        add_reference(s, uri)
    for syn in row[header["synonym"]].split("|"):
        syn = syn.strip()
        if len(syn) > 0 and syn != n: add_synonym(s, syn)
    for t in row[header["category"]].split("|"):
        t = t.strip()
        if len(t) > 0: add_type(s, t)
print(f"(Done) processed lines: {i}")
f_in.close()
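
For a node such as the NCBI Taxonomy entry for Escherichia coli, the written triples look like this (illustrative values):

obo:NCBITaxon_562 rdfs:label "Escherichia coli" .
obo:NCBITaxon_562 dc:identifier "http://purl.obolibrary.org/obo/NCBITaxon_562" .
obo:NCBITaxon_562 rdf:type biolink:OrganismTaxon .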

Edge extraction into the output file

Edges whose subject or object id was not resolved above, or whose predicate has an unknown prefix, are skipped.

i = 0
f_in = open(edgesPath, newline="")
reader = csv.reader(f_in, delimiter="\t")
rowsIt = iter(reader)
header = {k: v for v, k in enumerate(next(rowsIt))}
unknown_predicates = set()
used_predicates = set()
for row in rowsIt:
    i += 1
    if i % 50000 == 0: print(f"processed lines: {i}")
    s = id_to_uri.get(row[header["subject"]].strip())
    o = id_to_uri.get(row[header["object"]].strip())
    p = row[header["predicate"]].strip()
    p_parts = p.split(":", 1)
    if len(p_parts) != 2 or p_parts[0] not in prefixes:
        # the predicate itself is unusable, remember it for the report below
        unknown_predicates.add(p)
    elif s and o:
        used_predicates.add(p)
        add_triple(s, p, o)
print(f"(Done) processed lines: {i}")
f_in.close()
if len(unknown_predicates) > 0:
    print(f"Unknown predicates: {unknown_predicates}")
if len(unprefixes) > 0:
    print(f"Unresolved node prefixes: {unprefixes}")

Add labels for all used predicates into the output file. Each label is derived from the predicate's local name by splitting CamelCase and snake_case into lowercase words.

for p in used_predicates:
    add_label(p, re.sub("([A-Z])", r"_\1", p.split(":", 1)[1]).replace("_", " ").lower().strip())
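
For example, biolink:subclass_of gets the label "subclass of", and a CamelCase local name such as relatedTo would become "related to".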

Finally, close the output file.

outputStream.close()
