From 51fbb5d79b7d36b68885a10103a7368f8c324a22 Mon Sep 17 00:00:00 2001 From: AO33 Date: Thu, 14 Nov 2024 14:47:57 -0700 Subject: [PATCH] update to move go annotations to modularized version --- src/monarch_ingest/download.yaml | 72 --- src/monarch_ingest/ingests.yaml | 7 +- src/monarch_ingest/ingests/go/annotation.py | 122 ----- src/monarch_ingest/ingests/go/annotation.yaml | 92 ---- .../ingests/go/annotation_utils.py | 181 ------- .../ingests/go/gaf-eco-mapping.yaml | 47 -- src/monarch_ingest/ingests/go/metadata.yaml | 7 - tests/unit/go/test_go_annotation.py | 509 ------------------ 8 files changed, 5 insertions(+), 1032 deletions(-) delete mode 100644 src/monarch_ingest/ingests/go/annotation.py delete mode 100644 src/monarch_ingest/ingests/go/annotation.yaml delete mode 100644 src/monarch_ingest/ingests/go/annotation_utils.py delete mode 100644 src/monarch_ingest/ingests/go/gaf-eco-mapping.yaml delete mode 100644 src/monarch_ingest/ingests/go/metadata.yaml delete mode 100644 tests/unit/go/test_go_annotation.py diff --git a/src/monarch_ingest/download.yaml b/src/monarch_ingest/download.yaml index e134941d..e2372afa 100644 --- a/src/monarch_ingest/download.yaml +++ b/src/monarch_ingest/download.yaml @@ -198,78 +198,6 @@ local_name: data/flybase/entity_publication_fb.tsv.gz tag: flybase_publication_to_gene -### GO - -# Homo sapiens (human) -- url: http://current.geneontology.org/annotations/goa_human.gaf.gz - local_name: data/go/9606.go_annotations.gaf.gz - tag: go_annotation - -# Mus musculus (house mouse) -- url: http://current.geneontology.org/annotations/mgi.gaf.gz - local_name: data/go/10090.go_annotations.gaf.gz - tag: go_annotation - -# Rattus norvegicus (Norway rat) -- url: http://current.geneontology.org/annotations/rgd.gaf.gz - local_name: data/go/10116.go_annotations.gaf.gz - tag: go_annotation - -# Canis lupus familiaris (dog) -- url: http://current.geneontology.org/annotations/goa_dog.gaf.gz - local_name: data/go/9615.go_annotations.gaf.gz - tag: go_annotation - -# Bos taurus (cow) -- url: http://current.geneontology.org/annotations/goa_cow.gaf.gz - local_name: data/go/9913.go_annotations.gaf.gz - tag: go_annotation - -# Sus scrofa (pig) -- url: http://current.geneontology.org/annotations/goa_pig.gaf.gz - local_name: data/go/9823.go_annotations.gaf.gz - tag: go_annotation - -# Gallus gallus (chicken) -- url: http://current.geneontology.org/annotations/goa_chicken.gaf.gz - local_name: data/go/9031.go_annotations.gaf.gz - tag: go_annotation - -# Danio rerio (Zebrafish) -- url: http://current.geneontology.org/annotations/zfin.gaf.gz - local_name: data/go/7955.go_annotations.gaf.gz - tag: go_annotation - -# Drosophila melanogaster (fruit fly) -- url: http://current.geneontology.org/annotations/fb.gaf.gz - local_name: data/go/7227.go_annotations.gaf.gz - tag: go_annotation - -# Caenorhabditis elegans (nematodes) -- url: http://current.geneontology.org/annotations/wb.gaf.gz - local_name: data/go/6239.go_annotations.gaf.gz - tag: go_annotation - -# Dictyostelium discoideum -- url: http://current.geneontology.org/annotations/dictybase.gaf.gz - local_name: data/go/44689.go_annotations.gaf.gz - tag: go_annotation - -# Various species in the Aspergillus genus -- url: http://current.geneontology.org/annotations/aspgd.gaf.gz - local_name: data/go/5052.go_annotations.gaf.gz - tag: go_annotation - -# Saccharomyces cerevisiae (baker's yeast) -- url: http://current.geneontology.org/annotations/sgd.gaf.gz - local_name: data/go/4932.go_annotations.gaf.gz - tag: go_annotation - -# Schizosaccharomyces pombe -- url: http://current.geneontology.org/annotations/pombase.gaf.gz - local_name: data/go/4896.go_annotations.gaf.gz - tag: go_annotation - # HGNC - url: http://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt diff --git a/src/monarch_ingest/ingests.yaml b/src/monarch_ingest/ingests.yaml index 45d5716c..6c8d55fb 100644 --- a/src/monarch_ingest/ingests.yaml +++ b/src/monarch_ingest/ingests.yaml @@ -32,6 +32,11 @@ ncbi_gene: zfin_genotype_to_phenotype: url: - 'https://github.com/monarch-initiative/zfin-genotype-to-phenotype-ingest/releases/latest/download/zfin_genotype_to_phenotype_edges.tsv' +go_annotation: + url: + - 'https://github.com/monarch-initiative/go-ingest/releases/latest/download/go_annotation_edges.tsv' + + ## Ingests within this repository @@ -51,8 +56,6 @@ dictybase_gene: config: 'ingests/dictybase/gene.yaml' dictybase_gene_to_phenotype: config: 'ingests/dictybase/gene_to_phenotype.yaml' -go_annotation: - config: 'ingests/go/annotation.yaml' hgnc_gene: config: 'ingests/hgnc/gene.yaml' panther_genome_orthologs: diff --git a/src/monarch_ingest/ingests/go/annotation.py b/src/monarch_ingest/ingests/go/annotation.py deleted file mode 100644 index 80049b9f..00000000 --- a/src/monarch_ingest/ingests/go/annotation.py +++ /dev/null @@ -1,122 +0,0 @@ -""" -Gene Ontology Annotations Ingest module. - -Gene to GO term Associations -(to MolecularActivity, BiologicalProcess and CellularComponent) -""" - -import uuid - -from biolink_model.datamodel.pydanticmodel_v2 import KnowledgeLevelEnum, AgentTypeEnum -from koza.cli_utils import get_koza_app - -from monarch_ingest.ingests.go.annotation_utils import ( - parse_identifiers, - get_biolink_classes, - lookup_predicate, - get_infores, -) -from loguru import logger - -koza_app = get_koza_app("go_annotation") - -# for row in koza_app.source: # doesn't play nice with tests -while (row := koza_app.get_row()) is not None: - - gene_id, ncbitaxa = parse_identifiers(row) - - # Grab the Gene Ontology ID - go_id = row['GO_ID'] - - # Discern GO identifier 'aspect'' this term belongs to: - # 'F' == molecular_function - child of GO:0003674 - # 'P' == biological_process - child of GO:0008150 - # 'C' == cellular_component - child of GO:0005575 - go_aspect: str = row['Aspect'] - if not (go_aspect and go_aspect.upper() in ["F", "P", "C"]): - logger.warning(f"GAF Aspect '{str(go_aspect)}' is empty or unrecognized? Skipping record") - - else: - # Decipher the GO Evidence Code - evidence_code = row['Evidence_Code'] - eco_term = None - - if evidence_code and evidence_code in koza_app.translation_table.local_table: - eco_term = koza_app.translation_table.local_table[evidence_code] - - if not eco_term: - logger.warning(f"GAF Evidence Code '{str(evidence_code)}' is empty or unrecognized? Tagging as 'ND'") - eco_term = "ECO:0000307" - - # Association predicate is normally NOT negated - # except as noted below in the GAF qualifier field - negated = False - - # For root node annotations that use the ND evidence code should be used: - # - # molecular_function (GO:0003674) enables (RO:0002327) - # biological_process (GO:0008150) involved_in (RO:0002331) - # cellular_component (GO:0005575) is_active_in (RO:0002432) - # - predicate = None - if go_id == "GO:0003674" and eco_term == "ECO:0000307": - qualifier = "enables" - elif go_id == "GO:0008150" and eco_term == "ECO:0000307": - qualifier = "involved_in" - elif go_id == "GO:0005575" and eco_term == "ECO:0000307": - qualifier = "is_active_in" - else: - # The Association Predicate is otherwise inferred from the GAF 'Qualifier' used. - # Note that this qualifier may be negated (i.e. "NOT|"). - qualifier = row['Qualifier'] - - if qualifier: - # check for piped negation prefix (hopefully, well behaved!) - qualifier_parts = qualifier.split("|") - if qualifier_parts[0] == "NOT": - predicate = lookup_predicate(qualifier_parts[1]) - negated = True - else: - predicate = lookup_predicate(qualifier_parts[0]) - else: - # If qualifier missing, assign a default predicate - # a.k.a. predicate based on specified GO Aspect type - logger.error("GAF record is missing its qualifier...assigning default qualifier as per GO term Aspect") - if go_aspect == "F": - predicate = "enables" - elif go_aspect == "P": - predicate = "involved_in" - elif go_aspect == "C": - predicate = "located_in" - - if not predicate: - logger.error(f"GAF Qualifier '{str(qualifier)}' is unrecognized? Skipping the record...") - - else: - - # Retrieve the GO aspect related NamedThing category-associated 'node' and Association 'edge' classes - go_concept_node_class, gene_go_term_association_class = get_biolink_classes(go_aspect) - - # actual primary knowledge source of the GOA knowledge statement - assigned_by = get_infores(row['Assigned_By']) - - publications = [p.replace('MGI:MGI:','MGI:') for p in row['DB_Reference'].split("|")] if row['DB_Reference'] else [] - - # Instantiate the appropriate Gene-to-GO Term instance - association = gene_go_term_association_class( - id="uuid:" + str(uuid.uuid1()), - subject=gene_id, - object=go_id, - predicate=predicate, - negated=negated, - has_evidence=[eco_term], - publications=publications, - # subject_context_qualifier=ncbitaxa, # Biolink Pydantic model support missing for this slot - aggregator_knowledge_source=["infores:monarchinitiative"], - primary_knowledge_source=assigned_by, - knowledge_level=KnowledgeLevelEnum.knowledge_assertion, - agent_type=AgentTypeEnum.manual_agent, - ) - - # Write the captured Association out - koza_app.write(association) diff --git a/src/monarch_ingest/ingests/go/annotation.yaml b/src/monarch_ingest/ingests/go/annotation.yaml deleted file mode 100644 index 402fca39..00000000 --- a/src/monarch_ingest/ingests/go/annotation.yaml +++ /dev/null @@ -1,92 +0,0 @@ -name: 'go_annotation' - -format: 'csv' # is 'gaf' specifically recognized, or does it need to be specifically recognized? -delimiter: '\t' -header: none -comment_char: '!' - -files: - # Need to filter out 5052 - Contains data from multiple species of the - # genus Aspergillus, taxon:5052, not just Aspergillus nidulans - - './data/go/4932.go_annotations.gaf.gz' - - './data/go/4896.go_annotations.gaf.gz' - - './data/go/5052.go_annotations.gaf.gz' - - './data/go/6239.go_annotations.gaf.gz' - - './data/go/7227.go_annotations.gaf.gz' - - './data/go/7955.go_annotations.gaf.gz' - - './data/go/9031.go_annotations.gaf.gz' - - './data/go/9606.go_annotations.gaf.gz' - - './data/go/9615.go_annotations.gaf.gz' - - './data/go/9823.go_annotations.gaf.gz' - - './data/go/9913.go_annotations.gaf.gz' - - './data/go/10090.go_annotations.gaf.gz' - - './data/go/10116.go_annotations.gaf.gz' - - './data/go/44689.go_annotations.gaf.gz' - # - './data/go/162425.go_annotations.gaf.gz' - -filters: - - inclusion: 'include' - column: 'Taxon' - filter_code: 'in' - value: - - 'taxon:9606' - - 'taxon:10090' - - 'taxon:10116' - - 'taxon:9615' - - 'taxon:9913' - - 'taxon:9823' - - 'taxon:9031' - - 'taxon:7955' - - 'taxon:7227' - - 'taxon:6239' - - 'taxon:44689' - - 'taxon:4932' - - 'taxon:4896' - # Aspergillus nidulans FGSC A4 - primary genomic reference - # for Aspergillus nidulanse, - # data inside 5052.go_annotations.gaf.gz - - 'taxon:227321' - - 'taxon:559292' - -metadata: !include ./src/monarch_ingest/ingests/go/metadata.yaml - -global_table: './src/monarch_ingest/translation_table.yaml' - -# Evidence Code to ECO term mappings file -local_table: './src/monarch_ingest/ingests/go/gaf-eco-mapping.yaml' - -# http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/ -columns: - - DB - - DB_Object_ID - - DB_Object_Symbol - - Qualifier - - GO_ID - - DB_Reference - - Evidence_Code - - With_or_From - - Aspect - - DB_Object_Name - - DB_Object_Synonym - - DB_Object_Type - - Taxon - - Date - - Assigned_By - - Annotation_Extension - - Gene_Product_Form_ID - -edge_properties: - - 'id' - - 'category' - - 'subject' - - 'predicate' - - 'negated' - - 'has_evidence' - - 'object' - - 'publications' - - 'aggregator_knowledge_source' - - 'primary_knowledge_source' - - 'knowledge_level' - - 'agent_type' - -transform_mode: 'flat' diff --git a/src/monarch_ingest/ingests/go/annotation_utils.py b/src/monarch_ingest/ingests/go/annotation_utils.py deleted file mode 100644 index fd9d3c9e..00000000 --- a/src/monarch_ingest/ingests/go/annotation_utils.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Some Gene Ontology Annotation ingest utility functions. -""" - -from re import sub, IGNORECASE, compile, Pattern -from typing import Optional, Tuple, List, Dict - -from loguru import logger - -from biolink_model.datamodel.pydanticmodel_v2 import ( - BiologicalProcess, - CellularComponent, - MacromolecularMachineToBiologicalProcessAssociation, - MacromolecularMachineToCellularComponentAssociation, - MacromolecularMachineToMolecularActivityAssociation, - MolecularActivity, -) - - -def parse_ncbi_taxa(taxon: str) -> List[str]: - ncbi_taxa: List[str] = list() - if taxon: - # in rare circumstances, multiple taxa may be given as a piped list... - taxa = taxon.split("|") - for taxon in taxa: - ncbi_taxa.append(sub(r"^taxon", "NCBITaxon", taxon, flags=IGNORECASE)) - return ncbi_taxa - else: - return [] - - -_gene_identifier_map: Dict[str, Tuple[str, Pattern]] = { - # Genome sequenced model for - # Aspergillus nidulans FGSC A4 a.k.a. Emericella nidulans - # The proper CURIE prefix for this is not certain - "NCBITaxon:227321": ('AspGD', compile(r"(?PAN\d+)\|")) -} - - -def parse_identifiers(row: Dict): - """ - This method uses specific fields of the GOA data entry - to resolve both the gene identifier and the NCBI Taxon - """ - db: str = row['DB'] - db_object_id: str = row['DB_Object_ID'] - - # This check is to clean up id's like MGI:MGI:123 - if ":" in db_object_id: - db_object_id = db_object_id.split(':')[-1] - - ncbitaxa: List[str] = parse_ncbi_taxa(row['Taxon']) - if not ncbitaxa: - # Unlikely to happen, but... - logger.warning(f"Missing taxa for '{db}:{db_object_id}'?") - - # Hacky remapping of some gene identifiers - if ncbitaxa[0] in _gene_identifier_map.keys(): - id_regex: Pattern = _gene_identifier_map[ncbitaxa[0]][1] - aliases: str = row['DB_Object_Synonym'] - match = id_regex.match(aliases) - if match is not None: - # Overwrite the 'db' and 'db_object_id' accordingly - db = _gene_identifier_map[ncbitaxa[0]][0] - db_object_id = match.group('identifier') - - gene_id: str = f"{db}:{db_object_id}" - - return gene_id, ncbitaxa - - -# TODO: replace this workaround dictionary with direct usage of the -# Pydantic Predicate functionality and the translator_table.yaml -# Or an external local table file? -_predicate_by_name = { - "enables": {"predicate": "biolink:enables", "mapping": "RO:0002327"}, - "involved_in": {"predicate": "biolink:actively_involved_in", "mapping": "RO:0002331"}, - "located_in": {"predicate": "biolink:located_in", "mapping": "RO:0001025"}, - "contributes_to": {"predicate": "biolink:contributes_to", "mapping": "RO:0002326"}, - "acts_upstream_of": { - "predicate": "biolink:acts_upstream_of", - "mapping": "RO:0002263", - }, - "part_of": {"predicate": "biolink:part_of", "mapping": "BFO:0000050"}, - "acts_upstream_of_positive_effect": { - "predicate": "biolink:acts_upstream_of_positive_effect", - "mapping": "RO:0004034", - }, - "is_active_in": {"predicate": "biolink:active_in", "mapping": "RO:0002432"}, - "acts_upstream_of_negative_effect": { - "predicate": "biolink:acts_upstream_of_negative_effect", - "mapping": "RO:0004035", - }, - "colocalizes_with": { - "predicate": "biolink:colocalizes_with", - "mapping": "RO:0002325", - }, - "acts_upstream_of_or_within": { - "predicate": "biolink:acts_upstream_of_or_within", - "mapping": "RO:0002264", - }, - "acts_upstream_of_or_within_positive_effect": { - "predicate": "biolink:acts_upstream_of_or_within_positive_effect", - "mapping": "RO:0004032", - }, - "acts_upstream_of_or_within_negative_effect": { - "predicate": "biolink:acts_upstream_of_or_within_negative_effect", - "mapping": "RO:0004033", - }, -} - - -def lookup_predicate(name: str = None) -> Optional[str]: # see 'return' comment below Optional[Tuple[str, Any]]: - """ - :param name: string name of predicate to be looked up - :return: tuple(biolink:predicate, mapping to relation) if available; None otherwise - """ - if name and name in _predicate_by_name: - entry = _predicate_by_name[name] - else: - logger.error(f"Encountered unknown GAF qualifier '{str(name)}'?") - return None - - # Oct 2023 - we don't care about the RO relation anymore - # return entry["predicate"], entry["mapping"] - return entry["predicate"] - - -_biolink_class_by_go_aspect = { - "F": (MolecularActivity, MacromolecularMachineToMolecularActivityAssociation), - "P": (BiologicalProcess, MacromolecularMachineToBiologicalProcessAssociation), - "C": (CellularComponent, MacromolecularMachineToCellularComponentAssociation), -} - - -def get_biolink_classes(go_aspect: str) -> Tuple: - """ - Return a tuple of the Biolink Model Pydantic implementation of the - NamedThing category-associated 'node' and Association 'edge' classes - mapping onto the specified Gene Ontology 'aspect': - one of P (biological process), F (molecular function) or C (cellular component). - - :param go_aspect: single character code of GO aspect - :return: (category, association) tuple of Biolink Model Pydantic classes associated with the given GO aspect - """ - return _biolink_class_by_go_aspect[go_aspect.upper()] - - -# -# See comment below -# -# INFORES_OBJECT_ID_MAP = { -# "MGI": "mgi", -# "AspGD": "aspgd", -# "UniProt": "uniprot", -# } - - -def get_infores(source: Optional[str]) -> Optional[str]: - - if source: - # Our original vision was to do a proper mapping to known InfoRes CURIEs - # in the "Translator" # infores catalog but since the GOA file has such - # weird diversity in 'Assigned By' source values, one gets the sense that, - # for now, it is adequate to simply build a sensible infores "just-in-time". - - # if source not in INFORES_OBJECT_ID_MAP: - # INFORES_OBJECT_ID_MAP[source] = str(source).lower() - # # logger.warning( - # # f"Encountered source '{source}'?" + - # # f" Inferring 'infores' as {INFORES_OBJECT_ID_MAP[source]}" - # # ) - # print(INFORES_OBJECT_ID_MAP[source], flush=True) - # return f"infores:{INFORES_OBJECT_ID_MAP[source]}" - - # Reformat the source string to infores reference ID syntax standards - infores_id = str(source).strip().lower().replace("_", "-") - # Then prepend the namespace... - return f"infores:{infores_id}" - - return None diff --git a/src/monarch_ingest/ingests/go/gaf-eco-mapping.yaml b/src/monarch_ingest/ingests/go/gaf-eco-mapping.yaml deleted file mode 100644 index 27ff6cbf..00000000 --- a/src/monarch_ingest/ingests/go/gaf-eco-mapping.yaml +++ /dev/null @@ -1,47 +0,0 @@ ---- -"ISS-GO_REF:0000012": "ECO:0000031" -"ISS-GO_REF:0000027": "ECO:0000031" -"RCA": "ECO:0000245" -"ISA": "ECO:0000247" -"ISS": "ECO:0000250" -"ISM": "ECO:0000255" -"ISS-GO_REF:0000011": "ECO:0000255" -"IEA-GO_REF:0000002": "ECO:0000256" -"IEA-GO_REF:0000107": "ECO:0000256" -"IEA-GO_REF:0000019": "ECO:0000265" -"IEA-GO_REF:0000020": "ECO:0000265" -"IEA-GO_REF:0000035": "ECO:0000265" -"IEA-GO_REF:0000049": "ECO:0000265" -"ISO": "ECO:0000266" -"EXP": "ECO:0000269" -"IEP": "ECO:0000270" -"NAS": "ECO:0000303" -"TAS": "ECO:0000304" -"IC": "ECO:0000305" -"ND": "ECO:0000307" -"IDA": "ECO:0000314" -"IMP": "ECO:0000315" -"IGI": "ECO:0000316" -"IGC": "ECO:0000317" -"IBA": "ECO:0000318" -"IBD": "ECO:0000319" -"IKR": "ECO:0000320" -"IMR": "ECO:0000320" -"IRD": "ECO:0000321" -"IEA-GO_REF:0000037": "ECO:0000322" -"IEA-GO_REF:0000039": "ECO:0000322" -"IEA-GO_REF:0000041": "ECO:0000322" -"IEA-GO_REF:0000038": "ECO:0000323" -"IEA-GO_REF:0000040": "ECO:0000323" -"IPI": "ECO:0000353" -"IGC-GO_REF:0000025": "ECO:0000354" -"IEA-GO_REF:0000108": "ECO:0000363" -"IEA": "ECO:0000501" -"IEA-GO_REF:0000003": "ECO:0000501" -"IEA-GO_REF:0000004": "ECO:0000501" -"IEA-GO_REF:0000023": "ECO:0000501" -"HTP": "ECO:0006056" -"HMP": "ECO:0007001" -"HGI": "ECO:0007003" -"HDA": "ECO:0007005" -"HEP": "ECO:0007007" diff --git a/src/monarch_ingest/ingests/go/metadata.yaml b/src/monarch_ingest/ingests/go/metadata.yaml deleted file mode 100644 index 3e545bfa..00000000 --- a/src/monarch_ingest/ingests/go/metadata.yaml +++ /dev/null @@ -1,7 +0,0 @@ -name: 'GO' - -dataset_description: - ingest_title: 'GO' - ingest_url: 'http://geneontology.org/' - description: 'Gene Ontology Annotations Database' - rights: 'http://geneontology.org/docs/go-citation-policy/' diff --git a/tests/unit/go/test_go_annotation.py b/tests/unit/go/test_go_annotation.py deleted file mode 100644 index 7c21eaa4..00000000 --- a/tests/unit/go/test_go_annotation.py +++ /dev/null @@ -1,509 +0,0 @@ -""" -Unit tests for GO Annotations ingest -""" - -from typing import Tuple - -import pytest -from biolink_model.datamodel.pydanticmodel_v2 import Association -from koza.utils.testing_utils import mock_koza # noqa: F401 # noqa: F401 -from loguru import logger - -from monarch_ingest.ingests.go.annotation_utils import parse_identifiers - - -@pytest.mark.parametrize( - "query", - [ - ( - { - "DB": "AspGD", - "DB_Object_ID": "ASPL0000057967", - "DB_Object_Symbol": "catB", - "Qualifier": "acts_upstream_of_or_within", - "GO_ID": "GO:0019521", # D-gluconate metabolic process - "DB_Reference": "AspGD_REF:ASPL0000080002|PMID:18405346", - "Evidence_Code": "RCA", - "With_or_From": "", - "Aspect": "P", - "DB_Object_Name": "", - "DB_Object_Synonym": "AN9339|ANID_09339|ANIA_09339", - "DB_Object_Type": "gene_product", - "Taxon": "taxon:227321", - "Date": "20090403", - "Assigned_By": "AspGD", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - "AspGD:AN9339", - "NCBITaxon:227321", - ) - ], -) -def test_parse_identifiers(query: Tuple): - gene_id, ncbitaxa = parse_identifiers(query[0]) - assert gene_id == query[1] - assert query[2] in ncbitaxa - - -@pytest.fixture -def source_name(): - """ - :return: string source name of GO Annotations ingest - """ - return "go_annotation" - - -@pytest.fixture -def script(): - """ - :return: string path to GO Annotations ingest script - """ - return "./src/monarch_ingest/ingests/go/annotation.py" - - -@pytest.fixture(scope="package") -def local_table(): - """ - :return: string path to Evidence Code to ECO term mappings file - """ - return "src/monarch_ingest/ingests/go/gaf-eco-mapping.yaml" - - -@pytest.fixture -def test_rows(): - """ - :return: List of test GO Annotation data rows (realistic looking but synthetic data). - """ - return [ - # Core data test: a completely normal record - { - "DB": "UniProtKB", - "DB_Object_ID": "A0A024RBG1", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "enables", - "GO_ID": "GO:0003723", # molecular_function: RNA binding - "DB_Reference": "GO_REF:0000043", - "Evidence_Code": "IEA", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "F", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:9606", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Multiple taxa - { - "DB": "WB", - "DB_Object_ID": "WBGene00000013", - "DB_Object_Symbol": "abf-2", - "Qualifier": "involved_in", - "GO_ID": "GO:0050830", - "DB_Reference": "WB_REF:WBPaper00045314|PMID:24882217", - "Evidence_Code": "IEP", - "With_or_From": "", - "Aspect": "P", - "DB_Object_Name": "", - "DB_Object_Synonym": "C50F2.10|C50F2.e", - "DB_Object_Type": "gene", - "Taxon": "taxon:6239|taxon:46170", - "Date": "20140827", - "Assigned_By": "WB", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Test default qualifier override for molecular function - { - "DB": "UniProtKB", - "DB_Object_ID": "A0A024RBG2", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "contributes_to", - "GO_ID": "GO:0003674", # molecular_function root - "DB_Reference": "GO_REF:0003674", - "Evidence_Code": "ND", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "F", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:9606", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Test default qualifier override for biological process - { - "DB": "UniProtKB", - "DB_Object_ID": "A0A024RBG3", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "acts_upstream_of_negative_effect", - "GO_ID": "GO:0008150", # biological_process - "DB_Reference": "GO_REF:0008150", - "Evidence_Code": "ND", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "P", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:4932", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Test default qualifier override for cellular compartment - { - "DB": "UniProtKB", - "DB_Object_ID": "A0A024RBG4", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "colocalizes_with", - "GO_ID": "GO:0005575", # cellular compartment - "DB_Reference": "GO_REF:0005575", - "Evidence_Code": "ND", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "C", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:4932", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Test non-default Biological Process and non-default qualifier - { - "DB": "UniProtKB", - "DB_Object_ID": "Q6GZX3", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "acts_upstream_of_or_within", - "GO_ID": "GO:0045759", - "DB_Reference": "GO_REF:0045759", - "Evidence_Code": "ND", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "P", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:1000", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Test outcome of unknown UniProt idmapping: uniprot id - # is returned as gene id? Also try another evidence code - { - "DB": "UniProtKB", - "DB_Object_ID": "A0A024RBG5", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "enables", - "GO_ID": "GO:0003723", # molecular_function: RNA binding - "DB_Reference": "GO_REF:0000043", - "Evidence_Code": "HMP", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "F", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:9606", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Test non-default Biological Process with negated qualifier - { - "DB": "UniProtKB", - "DB_Object_ID": "Q6GZX0", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "NOT|acts_upstream_of_or_within", - "GO_ID": "GO:0045759", - "DB_Reference": "GO_REF:0045759", - "Evidence_Code": "ND", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "P", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:1000", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Missing (or wrong) GO term Aspect value - the record will be skipped? - # So no entry is needed in the result_expected dictionary below - { - "DB": "UniProtKB", - "DB_Object_ID": "Q6GZX0", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "acts_upstream_of_or_within", - "GO_ID": "GO:0045759", - "DB_Reference": "GO_REF:0045759", - "Evidence_Code": "IEA", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:1000", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Missing (empty) qualifier - assign GO Aspect associated default - { - "DB": "UniProtKB", - "DB_Object_ID": "A0A024RBG8", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "", - "GO_ID": "GO:0005575", # cellular compartment - "DB_Reference": "GO_REF:0005575", - "Evidence_Code": "IEA-GO_REF:0000041", - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "C", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:4932", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - # Invalid Evidence Code - coerced into 'ND' -> "ECO:0000307" - { - "DB": "UniProtKB", - "DB_Object_ID": "A0A024RBG9", - "DB_Object_Symbol": "NUDT4B", - "Qualifier": "enables", - "GO_ID": "GO:0003723", - "DB_Reference": "GO_REF:0000043", - "Evidence_Code": "XXX", # invalid Evidence Code - "With_or_From": "UniProtKB-KW:KW-0694", - "Aspect": "F", - "DB_Object_Name": "Diphosphoinositol polyphosphate phosphohydrolase", - "DB_Object_Synonym": "NUDT4B", - "DB_Object_Type": "protein", - "Taxon": "taxon:9606", - "Date": "20211010", - "Assigned_By": "UniProt", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - }, - ] - - -@pytest.fixture -def basic_go(mock_koza, source_name, test_rows, script, global_table, local_table): - """ - Mock Koza run for GO annotation ingest. - - :param mock_koza: - :param source_name: - :param test_rows: - :param script: - :param global_table: - :param local_table: - - :return: mock_koza application - """ - return mock_koza( - name=source_name, - data=test_rows, - transform_code=script, - global_table=global_table, - local_table=local_table, - map_cache=None, - ) - - -result_expected = { - # Test regular MolecularActivity go term - "UniProtKB:A0A024RBG1": [ - "biolink:Gene", - "NCBITaxon:9606", - "GO:0003723", - "biolink:MolecularActivity", - "biolink:BiologicalProcessOrActivity", - "biolink:enables", - "RO:0002327", - False, - "ECO:0000501", - ], - # Multiple Taxa - "WB:WBGene00000013": [ - "biolink:Gene", - "NCBITaxon:46170", # test for presence of the second one? - "GO:0050830", - "biolink:BiologicalProcess", - "biolink:BiologicalProcessOrActivity", - "biolink:actively_involved_in", - "RO:0002331", - False, - "ECO:0000270", - ], - # Test default qualifier override for Molecular Activity go term - "UniProtKB:A0A024RBG2": [ - "biolink:Gene", - "NCBITaxon:9606", - "GO:0003674", - "biolink:MolecularActivity", - "biolink:BiologicalProcessOrActivity", - "biolink:enables", - "RO:0002327", - False, - "ECO:0000307", - ], - # Test default qualifier override for Biological Process go term - "UniProtKB:A0A024RBG3": [ - "biolink:Gene", - "NCBITaxon:4932", - "GO:0008150", - "biolink:BiologicalProcess", - "biolink:BiologicalProcessOrActivity", - "biolink:actively_involved_in", - "RO:0002331", - False, - "ECO:0000307", - ], - # Test default qualifier override for Cellular Component go term - "UniProtKB:A0A024RBG4": [ - "biolink:Gene", - "NCBITaxon:4932", - "GO:0005575", - "biolink:CellularComponent", - "biolink:AnatomicalEntity", - "biolink:active_in", - "RO:0002432", - False, - "ECO:0000307", - ], - # Test non-default Biological Process and non-default qualifier - "UniProtKB:Q6GZX3": [ - "biolink:Gene", - "NCBITaxon:1000", - "GO:0045759", - "biolink:BiologicalProcess", - "biolink:BiologicalProcessOrActivity", - "biolink:acts_upstream_of_or_within", - "RO:0002264", - False, - "ECO:0000307", - ], - # Test outcome of unknown UniProt idmapping: uniprot id - # is returned as gene id? Also try another evidence code - "UniProtKB:A0A024RBG5": [ - "biolink:Gene", - "NCBITaxon:9606", - "GO:0003723", - "biolink:MolecularActivity", - "biolink:BiologicalProcessOrActivity", - "biolink:enables", - "RO:0002327", - False, - "ECO:0007001", - ], - # Test non-default Biological Process with negated qualifier - "UniProtKB:Q6GZX0": [ - "biolink:Gene", - "NCBITaxon:1000", - "GO:0045759", - "biolink:BiologicalProcess", - "biolink:BiologicalProcessOrActivity", - "biolink:acts_upstream_of_or_within", - "RO:0002264", - True, - "ECO:0000307", - ], - # Missing (empty) qualifier - assign GO Aspect associated default - "UniProtKB:A0A024RBG8": [ - "biolink:Gene", - "NCBITaxon:4932", - "GO:0005575", - "biolink:CellularComponent", - "biolink:AnatomicalEntity", - "biolink:located_in", - "RO:0002432", - False, - "ECO:0000307", - ], - # Invalid Evidence Code - coerced into 'ND' -> "ECO:0000307" - "UniProtKB:A0A024RBG9": [ - "biolink:Gene", - "NCBITaxon:9606", - "GO:0003723", - "biolink:MolecularActivity", - "biolink:BiologicalProcessOrActivity", - "biolink:enables", - "RO:0002327", - False, - "ECO:0000307", - ], -} - - -def test_association(basic_go): - if not len(basic_go): - logger.warning("test_association() null test?") - return - - association = basic_go[2] - assert association - assert association.subject in result_expected.keys() - - assert association.object == result_expected[association.subject][2] - assert association.predicate == result_expected[association.subject][5] - assert association.negated == result_expected[association.subject][7] - assert result_expected[association.subject][8] in association.has_evidence - - assert association.primary_knowledge_source == "infores:uniprot" - assert "infores:monarchinitiative" in association.aggregator_knowledge_source - - -@pytest.fixture -def mgi_entities(mock_koza, source_name, script, global_table, local_table): - row = { - "DB": "MGI", - "DB_Object_ID": "MGI:1918911", - "DB_Object_Symbol": "0610005C13Rik", - "Qualifier": "enables", - "GO_ID": "GO:0003674", - "DB_Reference": "MGI:MGI:2156816|GO_REF:0000015", - "Evidence_Code": "ND", - "With_or_From": "", - "Aspect": "F", - "DB_Object_Name": "RIKEN cDNA 0610005C13 gene", - "DB_Object_Synonym": "", - "DB_Object_Type": "gene", - "Taxon": "taxon:10090", - "Date": "20200917", - "Assigned_By": "MGI", - "Annotation_Extension": "", - "Gene_Product_Form_ID": "", - } - - return mock_koza( - name=source_name, - data=row, - transform_code=script, - global_table=global_table, - local_table=local_table, - map_cache=None, - ) - - -def test_mgi_curie(mgi_entities): - association = [association for association in mgi_entities if isinstance(association, Association)][0] - assert association - assert association.subject == "MGI:1918911" - assert association.publications == ["MGI:2156816", "GO_REF:0000015"] - assert association.primary_knowledge_source == "infores:mgi" - assert "infores:monarchinitiative" in association.aggregator_knowledge_source