Merge pull request #394 from dbkeator/master

dbkeator · web-flow · commit 40b6199c3cdc · 2023-12-19T14:37:36.000-08:00
fixed bug with BIDS sidecar files when supplied by user and updated tests
diff --git a/src/nidm/experiment/Utils.py b/src/nidm/experiment/Utils.py
@@ -1409,17 +1409,28 @@ def map_variables_to_terms(
     dataset_identifier=None,
 ):
     """
-
     :param df: data frame with first row containing variable names
     :param assessment_name: Name for the assessment to use in storing JSON mapping dictionary keys
     :param json_source: optional json document either in file or structure
             with variable names as keys and minimal fields "definition","label","url"
     :param output_file: output filename to save variable-> term mappings
     :param directory: if output_file parameter is set to None then use this directory to store default JSON mapping file
-    if doing variable->term mappings
+            if doing variable->term mappings
+    :param bids: if bids is set to True then a BIDS-compliant sidecar file will be written if annotations are made
+    :param owl_file: if a web-ontology language (OWL) file is supplied then it will be used to look for terms while
+            annotating; otherwise the default NIDM terminology will be used.
+    :param associate_concepts: if this is set to True then concept association will be performed for each variable
+            otherwise it will not.
+    :param dataset_identifier: unique identifier to identify a dataset such as a project in OpenNeuro
+            which is used in the NIDM records as a namespace to go along with a unique ID generated for the NIDM RDF graphs
     :return:return dictionary mapping variable names (i.e. columns) to terms
     """
 
+    # 12/15/23: indicator variable to identify if annotations were made with the pynidm tools. If not, and this
+    # is a bids-nidm conversion, and the user supplied a bids-compliant json sidecar file, save the original
+    # file.
+    annot_made = False
+
     # dictionary mapping column name to preferred term
     column_to_terms = {}
 
@@ -1549,11 +1560,13 @@ def map_variables_to_terms(
                         column_to_terms[current_tuple]["label"] = json_map[json_key][
                             "sourceVariable"
                         ]
+                    # this is probably a BIDS json file so use the json_key as the label
                     else:
-                        column_to_terms[current_tuple]["label"] = ""
+                        column_to_terms[current_tuple]["label"] = json_key
                         print(
-                            "No label or source_variable or sourceVariable keys found in json mapping file for variable "
-                            f"{json_key}. Consider adding these to the json file as they are important"
+                            "No label or source_variable/SourceVariable key found in json mapping file for variable "
+                            f"{json_key}. This is ok if this is a BIDS json sidecar file."
+                            "Otherwise, consider adding a label to the json file."
                         )
                 else:
                     column_to_terms[current_tuple]["label"] = json_map[json_key][
@@ -1812,6 +1825,7 @@ def map_variables_to_terms(
                         "maxValue:",
                         column_to_terms[current_tuple]["responseOptions"]["maxValue"],
                     )
+
                 if "hasUnit" in json_map[json_key]:
                     # upgrade 'hasUnit' to 'responseOptions'->'unitCode
                     if "responseOptions" not in column_to_terms[current_tuple].keys():
@@ -1849,6 +1863,7 @@ def map_variables_to_terms(
                                     ilx_obj,
                                     nidm_owl_graph=nidm_owl_graph,
                                 )
+                                annot_made = True
                                 # write annotations to json file so user can start up again if not doing whole file
                                 write_json_mapping_file(
                                     column_to_terms, output_file, bids
@@ -1917,7 +1932,8 @@ def map_variables_to_terms(
                     # if user ran in mode where they want to associate concepts and this isn't the participant
                     # id field then associate concepts.
                     if match_participant_id_field(
-                        json_map[json_key]["source_variable"]
+                        # json_map[json_key]["source_variable"]
+                        column_to_terms[current_tuple]["source_variable"]
                     ):
                         column_to_terms[current_tuple]["isAbout"] = []
                         column_to_terms[current_tuple]["isAbout"].append(
@@ -1936,6 +1952,7 @@ def map_variables_to_terms(
                             ilx_obj,
                             nidm_owl_graph=nidm_owl_graph,
                         )
+                        annot_made = True
                         # write annotations to json file so user can start up again if not doing whole file
                         write_json_mapping_file(column_to_terms, output_file, bids)
 
@@ -1990,6 +2007,8 @@ def map_variables_to_terms(
             column_to_terms[current_tuple] = {}
             # enter user interaction function to get data dictionary annotations from user
             annotate_data_element(column, current_tuple, column_to_terms)
+            # 12/15/23
+            annot_made = True
         # then ask user to find a concept if they selected to do so
         if associate_concepts:
             # provide user with opportunity to associate a concept with this annotation
@@ -2000,6 +2019,7 @@ def map_variables_to_terms(
                 ilx_obj,
                 nidm_owl_graph=nidm_owl_graph,
             )
+            annot_made = True
             # write annotations to json file so user can start up again if not doing whole file
             write_json_mapping_file(column_to_terms, output_file, bids)
 
@@ -2062,8 +2082,12 @@ def map_variables_to_terms(
             column_to_terms[current_tuple]["url"] = ilx_output.iri
         except Exception:
             print("WARNING: WIP: Data element not submitted to InterLex.  ")
-    # write annotations to json file since data element annotations are complete
-    write_json_mapping_file(column_to_terms, output_file, bids)
+
+    # 12/15/23: If doing a BIDS-NIDM conversion and the user supplied a BIDS-compliant json sidecar file
+    # and no annotations were made, leave original BIDS json file as it is...
+    if annot_made:
+        # write annotations to json file since data element annotations are complete
+        write_json_mapping_file(column_to_terms, output_file, bids)
 
     # get CDEs for data dictionary and NIDM graph entity of data
     cde = DD_to_nidm(column_to_terms, dataset_identifier=dataset_identifier)
diff --git a/src/nidm/experiment/tools/nidm_affinity_propagation.py b/src/nidm/experiment/tools/nidm_affinity_propagation.py
@@ -239,7 +239,7 @@ def data_aggregation(reporter):  # all data from all the files is collected
                     + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."
                 )
                 for i, nf in enumerate(not_found_list):
-                    reporter.print(f"{i+1}. {nf}")
+                    reporter.print(f"{i + 1}. {nf}")
                 not_found_list.clear()
                 not_found_count += 1
                 print()
diff --git a/src/nidm/experiment/tools/nidm_agglomerative_clustering.py b/src/nidm/experiment/tools/nidm_agglomerative_clustering.py
@@ -239,7 +239,7 @@ def data_aggregation(reporter):  # all data from all the files is collected
                     + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."
                 )
                 for i, nf in enumerate(not_found_list):
-                    reporter.print(f"{i+1}. {nf}")
+                    reporter.print(f"{i + 1}. {nf}")
                 not_found_list.clear()
                 not_found_count += 1
                 print()
diff --git a/src/nidm/experiment/tools/nidm_gmm.py b/src/nidm/experiment/tools/nidm_gmm.py
@@ -272,7 +272,7 @@ def data_aggregation(reporter):  # all data from all the files is collected
                     + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."
                 )
                 for i, nf in enumerate(not_found_list):
-                    reporter.print(f"{i+1}. {nf}")
+                    reporter.print(f"{i + 1}. {nf}")
                 not_found_list.clear()
                 not_found_count += 1
                 print()
diff --git a/src/nidm/experiment/tools/nidm_kmeans.py b/src/nidm/experiment/tools/nidm_kmeans.py
@@ -278,7 +278,7 @@ def data_aggregation(reporter):  # all data from all the files is collected
                     + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."
                 )
                 for i, nf in enumerate(not_found_list):
-                    reporter.print(f"{i+1}. {nf}")
+                    reporter.print(f"{i + 1}. {nf}")
                 not_found_list.clear()
                 not_found_count += 1
                 print()
diff --git a/src/nidm/experiment/tools/nidm_linreg.py b/src/nidm/experiment/tools/nidm_linreg.py
@@ -328,7 +328,7 @@ def data_aggregation(reporter):  # all data from all the files is collected
                     + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."
                 )
                 for i, nf in enumerate(not_found_list):
-                    reporter.print(f"{i+1}. {nf}")
+                    reporter.print(f"{i + 1}. {nf}")
                 not_found_list.clear()
                 not_found_count += 1
                 print()
diff --git a/tests/experiment/test_map_vars_to_terms.py b/tests/experiment/test_map_vars_to_terms.py
@@ -1,16 +1,18 @@
 from dataclasses import dataclass
 import json
+from os.path import join
 from pathlib import Path
 import pandas as pd
 import pytest
-from nidm.experiment.Utils import map_variables_to_terms
+from nidm.experiment.Utils import map_variables_to_terms, write_json_mapping_file
 
 
 @dataclass
 class Setup:
     data: pd.DataFrame
     reproschema_json_map: dict
     bids_sidecar: dict
+    bids_sidecar_simple: dict
 
 
 @pytest.fixture(scope="module")
@@ -135,11 +137,24 @@ def setup() -> Setup:
         }
         """
     )
+    bids_sidecar_simple = json.loads(
+        """
+        {
+            "age": {
+                "description": "age of participant"
+                },
+            "sex": {
+                "description": "biological sex of participant"
+            }
+        }
+        """
+    )
 
     return Setup(
         data=data,
         reproschema_json_map=reproschema_json_map,
         bids_sidecar=bids_sidecar,
+        bids_sidecar_simple=bids_sidecar_simple,
     )
 
 
@@ -149,6 +164,7 @@ def test_map_vars_to_terms_BIDS(setup: Setup, tmp_path: Path) -> None:
     JSON sidecar file
     """
 
+    # test BIDS sidecar json file with all pynidm annotations
     column_to_terms, cde = map_variables_to_terms(
         df=setup.data,
         json_source=setup.bids_sidecar,
@@ -204,6 +220,17 @@ def test_map_vars_to_terms_BIDS(setup: Setup, tmp_path: Path) -> None:
         ]["Male"]
     )
 
+    # force writing of column_to_terms structure because here we're not doing annotations and so
+    # map_variables_to_terms won't write it out since we supplied one for it to open...thus it already exists
+    # and no annotations were made so it should exist in its original form.
+    # By explicitly writing it out here, after running map_variables_to_terms, we can assure it's the same as the
+    # original.
+
+    # write annotations to json file since data element annotations are complete
+    write_json_mapping_file(
+        column_to_terms, join(str(tmp_path), "nidm_annotations.json"), True
+    )
+
     # now check the JSON sidecar file created by map_variables_to_terms which should match BIDS format
     with open(tmp_path / "nidm_annotations.json", encoding="utf-8") as fp:
         bids_sidecar = json.load(fp)
@@ -245,6 +272,69 @@ def test_map_vars_to_terms_BIDS(setup: Setup, tmp_path: Path) -> None:
     assert len(results) == 20
 
 
+def test_map_vars_to_terms_BIDS_simple(setup: Setup, tmp_path: Path) -> None:
+    """
+    This function will test the Utils.py "map_vars_to_terms" function with a BIDS-formatted
+    JSON sidecar file
+    """
+
+    # test a minimal BIDS sidecar json file containing only descriptions (no pynidm annotations)
+    column_to_terms, cde = map_variables_to_terms(
+        df=setup.data,
+        json_source=setup.bids_sidecar_simple,
+        directory=str(tmp_path),
+        assessment_name="test",
+        associate_concepts=False,
+        bids=True,
+    )
+
+    # check whether JSON mapping structure returned from map_variables_to_terms matches the
+    # simple BIDS sidecar structure
+    assert "DD(source='test', variable='age')" in column_to_terms
+    assert "DD(source='test', variable='sex')" in column_to_terms
+    assert "description" in column_to_terms["DD(source='test', variable='age')"]
+    assert "description" in column_to_terms["DD(source='test', variable='sex')"]
+
+    # force writing of column_to_terms structure because here we're not doing annotations and so
+    # map_variables_to_terms won't write it out since we supplied one for it to open...thus it already exists
+    # and no annotations were made so it should exist in its original form.
+    # By explicitly writing it out here, after running map_variables_to_terms, we can assure it's the same as the
+    # original.
+
+    # write annotations to json file since data element annotations are complete
+    write_json_mapping_file(
+        column_to_terms, join(str(tmp_path), "nidm_annotations.json"), True
+    )
+
+    # now check the JSON sidecar file created by map_variables_to_terms which should match BIDS format
+    with open(tmp_path / "nidm_annotations.json", encoding="utf-8") as fp:
+        bids_sidecar = json.load(fp)
+
+    assert "age" in bids_sidecar.keys()
+    assert "sex" in bids_sidecar.keys()
+    assert "description" in bids_sidecar["age"]
+    assert "description" in bids_sidecar["sex"]
+
+    # check the CDE dataelement graph for correct information
+    query = """
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+        select distinct ?uuid ?DataElements ?property ?value
+            where {
+
+                ?uuid a/rdfs:subClassOf* nidm:DataElement ;
+                    ?property ?value .
+
+        }"""
+    qres = cde.query(query)
+
+    results = []
+    for row in qres:
+        results.append(list(row))
+
+    assert len(results) == 16
+
+
 def test_map_vars_to_terms_reproschema(setup: Setup, tmp_path: Path) -> None:
     """
     This function will test the Utils.py "map_vars_to_terms" function with a reproschema-formatted
@@ -305,6 +395,17 @@ def test_map_vars_to_terms_reproschema(setup: Setup, tmp_path: Path) -> None:
         ]["Male"]
     )
 
+    # force writing of column_to_terms structure because here we're not doing annotations and so
+    # map_variables_to_terms won't write it out since we supplied one for it to open...thus it already exists
+    # and no annotations were made so it should exist in its original form.
+    # By explicitly writing it out here, after running map_variables_to_terms, we can assure it's the same as the
+    # original.
+
+    # write annotations to json file since data element annotations are complete
+    write_json_mapping_file(
+        column_to_terms, join(str(tmp_path), "nidm_annotations.json"), False
+    )
+
     # now check the JSON mapping file created by map_variables_to_terms which should match Reproschema format
     with open(tmp_path / "nidm_annotations_annotations.json", encoding="utf-8") as fp:
         json.load(fp)

Original file line number	Diff line number	Diff line change
`@@ -239,7 +239,7 @@ def data_aggregation(reporter): # all data from all the files is collected`
`239`	`239`	`+ ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."`
`240`	`240`	`)`
`241`	`241`	`for i, nf in enumerate(not_found_list):`
`242`		`- reporter.print(f"{i+1}. {nf}")`
	`242`	`+ reporter.print(f"{i + 1}. {nf}")`
`243`	`243`	`not_found_list.clear()`
`244`	`244`	`not_found_count += 1`
`245`	`245`	`print()`
Original file line number	Diff line number	Diff line change
`@@ -272,7 +272,7 @@ def data_aggregation(reporter): # all data from all the files is collected`
`272`	`272`	`+ ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."`
`273`	`273`	`)`
`274`	`274`	`for i, nf in enumerate(not_found_list):`
`275`		`- reporter.print(f"{i+1}. {nf}")`
	`275`	`+ reporter.print(f"{i + 1}. {nf}")`
`276`	`276`	`not_found_list.clear()`
`277`	`277`	`not_found_count += 1`
`278`	`278`	`print()`
Original file line number	Diff line number	Diff line change
`@@ -278,7 +278,7 @@ def data_aggregation(reporter): # all data from all the files is collected`
`278`	`278`	`+ ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."`
`279`	`279`	`)`
`280`	`280`	`for i, nf in enumerate(not_found_list):`
`281`		`- reporter.print(f"{i+1}. {nf}")`
	`281`	`+ reporter.print(f"{i + 1}. {nf}")`
`282`	`282`	`not_found_list.clear()`
`283`	`283`	`not_found_count += 1`
`284`	`284`	`print()`
Original file line number	Diff line number	Diff line change
`@@ -328,7 +328,7 @@ def data_aggregation(reporter): # all data from all the files is collected`
`328`	`328`	`+ ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables."`
`329`	`329`	`)`
`330`	`330`	`for i, nf in enumerate(not_found_list):`
`331`		`- reporter.print(f"{i+1}. {nf}")`
	`331`	`+ reporter.print(f"{i + 1}. {nf}")`
`332`	`332`	`not_found_list.clear()`
`333`	`333`	`not_found_count += 1`
`334`	`334`	`print()`