From d7e977b45838249a3de30681d08873ca5d43dea3 Mon Sep 17 00:00:00 2001
From: Nils Krehl <nils.krehl@posteo.de>
Date: Fri, 12 Apr 2024 12:57:42 +0200
Subject: [PATCH] explicit split between binary, continuous and categorical

---
 data/example_input/mapping.yaml               |  66 ++++++++-
 data/example_input/mock_data.csv              |  10 +-
 .../adapters/clinical_dataset_adapter.py      |  87 +++++++-----
 test/test_integration.py                      | 133 ++++++++++++++----
 4 files changed, 225 insertions(+), 71 deletions(-)

diff --git a/data/example_input/mapping.yaml b/data/example_input/mapping.yaml
index 675c994..eb9ea39 100644
--- a/data/example_input/mapping.yaml
+++ b/data/example_input/mapping.yaml
@@ -4,60 +4,107 @@ Nodes:
     coding_system: snomedct
     object_type: instance
     id_in_coding_system: 116154003
+    col_value_type: categorical
   Overall Survival (days):
     coding_system: snomedct
     object_type: concept
     id_in_coding_system: 445320007
+    col_value_type: continuous
+  death_observed:
+    coding_system: snomedct
+    object_type: concept
+    id_in_coding_system: 419620001
+    col_value_type: continuous
   Clinical_Oxygen saturation in Arterial blood ; %:
     coding_system: snomedct
     object_type: concept
     id_in_coding_system: 442476006
+    col_value_type: continuous
+  gender_female:
+    coding_system: snomedct
+    object_type: concept
+    id_in_coding_system: 1086007
+    col_value_type: binary
   # Lab values (Loinc)
   LAB_Eos. Granulozyten# ; /nl:
     coding_system: loinc
     object_type: concept
     id_in_coding_system: 26449-9
+    col_value_type: continuous
+  Grading:
+    coding_system: loinc
+    object_type: concept
+    id_in_coding_system: 59542-1
+    col_value_type: categorical
+  KRAS:
+    coding_system: loinc
+    object_type: concept
+    id_in_coding_system: 21702-6
+    col_value_type: binary
   # Diseases (ICD)
   ICD_B95:
     coding_system: icd10
     object_type: concept
     id_in_coding_system: B95
+    col_value_type: binary
   ICD_A02:
     coding_system: icd10
     object_type: concept
     id_in_coding_system: A02
+    col_value_type: binary
   Cancer_C01:
     coding_system: icd10
     object_type: concept
     id_in_coding_system: C01
+    col_value_type: binary
   # Operations and procedures (german OPS)
   OPS_1-100:
     coding_system: ops
     object_type: concept
     id_in_coding_system: 1-100
+    col_value_type: binary
   # not mapped columns
-  not_mapped_discrete_value:
+  not_mapped_binary_value:
     coding_system: not_mapped_to_ontology
     object_type: concept
     id_in_coding_system: .nan
+    col_value_type: binary
   not_mapped_continuous_value:
     coding_system: not_mapped_to_ontology
     object_type: concept
     id_in_coding_system: .nan
+    col_value_type: continuous
+  not_mapped_categorical_value:
+    coding_system: not_mapped_to_ontology
+    object_type: concept
+    id_in_coding_system: .nan
+    col_value_type: categorical
 
 Edges:
-  HAS_CLINICAL_PARAMETER:
+  HAS_CLINICAL_PARAMETER_BINARY:
+    source_node: Patient ID
+    target_nodes: [gender_female]
+  HAS_CLINICAL_PARAMETER_CONTINUOUS:
     source_node: Patient ID
-    target_nodes: [Overall Survival (days), Clinical_Oxygen saturation in Arterial blood ; %]
+    target_nodes: [Overall Survival (days), death_observed, Clinical_Oxygen saturation in Arterial blood ; %]
     properties:
       value:
-        type: float # TODO: use float because int is also float (but is this really a good solution?)
-  HAS_LAB_VALUE:
+        type: float
+  HAS_LAB_VALUE_BINARY:
+    source_node: Patient ID
+    target_nodes: [KRAS]
+  HAS_LAB_VALUE_CONTINUOUS:
     source_node: Patient ID
     target_nodes: [LAB_Eos. Granulozyten# ; /nl]
     properties:
       value:
         type: float
+  HAS_LAB_VALUE_CATEGORICAL:
+    source_node: Patient ID
+    target_nodes: [Grading]
+    properties:
+      value:
+        type: float
   HAS_DISEASE:
     source_node: Patient ID
     target_nodes: [ICD_B95, ICD_A02, Cancer_C01]
@@ -67,7 +114,7 @@ Edges:
   NOT_DEFINED_BINARY:
     source_node: Patient ID
     target_nodes:
-      - not_mapped_discrete_value
+      - not_mapped_binary_value
   NOT_DEFINED_CONTINUOUS:
     properties:
       value:
@@ -75,3 +122,10 @@ Edges:
     source_node: Patient ID
     target_nodes:
       - not_mapped_continuous_value
+  NOT_DEFINED_CATEGORICAL:
+    properties:
+      value:
+        type: float
+    source_node: Patient ID
+    target_nodes:
+      - not_mapped_categorical_value
diff --git a/data/example_input/mock_data.csv b/data/example_input/mock_data.csv
index f9a0ee8..3b32036 100644
--- a/data/example_input/mock_data.csv
+++ b/data/example_input/mock_data.csv
@@ -1,5 +1,5 @@
-Patient ID,Overall Survival (days),LAB_Eos. Granulozyten# ; /nl,ICD_B95,ICD_A02,Clinical_Oxygen saturation in Arterial blood ; %,Cancer_C01,OPS_1-100,not_mapped_discrete_value,not_mapped_continuous_value
-1,150,0.11,0,1,97,0,1,1,0.1
-2,164,0.12,1,1,96,0,1,0,0.0
-3,"",,,,,1,,,
-4,,0.14,0,0,94,,0,,
+Patient ID,Overall Survival (days),death_observed,Clinical_Oxygen saturation in Arterial blood ; %,gender_female,LAB_Eos. Granulozyten# ; /nl,Grading,KRAS,ICD_B95,ICD_A02,Cancer_C01,OPS_1-100,not_mapped_binary_value,not_mapped_continuous_value,not_mapped_categorical_value
+1,150,1,97,0,0.11,1,1,0,1,0,1,1,0.1,1
+2,164,0,96,1,0.12,2,0,1,1,0,1,0,0,2
+3,,,,,,,,,,1,,,,
+4,"","",94,1,0.14,3,0,0,0,,0,,,3
diff --git a/patient_kg/adapters/clinical_dataset_adapter.py b/patient_kg/adapters/clinical_dataset_adapter.py
index 51bf9d2..c212c1a 100644
--- a/patient_kg/adapters/clinical_dataset_adapter.py
+++ b/patient_kg/adapters/clinical_dataset_adapter.py
@@ -170,49 +170,75 @@ def get_edges(self):
             source_node_id = self.dataset_mapping["Nodes"][source_node][
                 "id_in_coding_system"
             ]
-            target_nodes_dict = {}
+            target_node_id_to_col_name = {}
             for target_node_name in target_nodes_list:
                 target_node_coding_system = self.dataset_mapping["Nodes"][
                     target_node_name
                 ]["coding_system"]
                 if "not_mapped_to_ontology" in target_node_coding_system:
-                    target_nodes_dict[target_node_name] = target_node_name
+                    target_node_id_to_col_name[
+                        target_node_name
+                    ] = target_node_name
                 else:
-                    target_nodes_dict[
+                    target_node_id_to_col_name[
                         self.dataset_mapping["Nodes"][target_node_name][
                             "id_in_coding_system"
                         ]
                     ] = target_node_name
 
-            for target_node_id, target_node_value in target_nodes_dict.items():
+            for (
+                target_node_id,
+                target_node_col_name,
+            ) in target_node_id_to_col_name.items():
+                target_node_col_value_type = self.dataset_mapping["Nodes"][
+                    target_node_col_name
+                ]["col_value_type"]
+
                 for row_index, row in self.dataset.iterrows():
+                    edge = None
                     source_node_id_instance = (
                         f"{source_node_id}_{int(row[source_node])}"
                     )
-
                     properties = {}
-                    if defined_properties is not None:
-                        # weighted edge_name
-                        for property in defined_properties:
-                            if (
-                                not pd.isna(row[target_node_value])
-                                or row[target_node_value] == ""
-                            ):
+
+                    if (
+                        target_node_col_value_type == "continuous"
+                        or target_node_col_value_type == "categorical"
+                    ):
+                        if defined_properties is not None:
+                            # weighted edge: column value stored as a property
+                            for property in defined_properties:
                                 if (
-                                    defined_properties[property]["type"]
-                                    == "int"
-                                ):
-                                    properties[property] = int(
-                                        row[target_node_value]
-                                    )
-                                elif (
-                                    defined_properties[property]["type"]
-                                    == "float"
+                                    not pd.isna(row[target_node_col_name])
+                                    or row[target_node_col_name] == ""
                                 ):
-                                    properties[property] = float(
-                                        row[target_node_value]
-                                    )
-                        if properties != {}:
+                                    if (
+                                        defined_properties[property]["type"]
+                                        == "int"
+                                    ):
+                                        properties[property] = int(
+                                            row[target_node_col_name]
+                                        )
+                                    elif (
+                                        defined_properties[property]["type"]
+                                        == "float"
+                                    ):
+                                        properties[property] = float(
+                                            row[target_node_col_name]
+                                        )
+                            if properties != {}:
+                                relationship_id = "E" + str(edge_id)
+                                edge_id += 1
+                                edge = Edge.create_instance(
+                                    relationship_id,
+                                    source_node_id_instance,
+                                    target_node_id,
+                                    edge_name,
+                                    properties,
+                                )
+                    if target_node_col_value_type == "binary":
+                        if row[target_node_col_name] == 1:
+                            # binary edge: created only when the value is 1
                             relationship_id = "E" + str(edge_id)
                             edge_id += 1
                             edge = Edge.create_instance(
@@ -222,17 +248,6 @@ def get_edges(self):
                                 edge_name,
                                 properties,
                             )
-                    if row[target_node_value] == 1:
-                        # binary edge_name
-                        relationship_id = "E" + str(edge_id)
-                        edge_id += 1
-                        edge = Edge.create_instance(
-                            relationship_id,
-                            source_node_id_instance,
-                            target_node_id,
-                            edge_name,
-                            properties,
-                        )
 
                     if edge is not None:
                         # logger.info(f"Adding edge_name {edge.get_relationship_id()}, {edge.get_source_node_id()}, {edge.get_target_node_id()}, {edge.get_label()}, {edge.get_properties()}")
diff --git a/test/test_integration.py b/test/test_integration.py
index b97023f..688ab67 100644
--- a/test/test_integration.py
+++ b/test/test_integration.py
@@ -15,8 +15,12 @@ def test_adapter_end_to_end():
         ("116154003_3", "patient (person)", {}),
         ("116154003_4", "patient (person)", {}),
         ("445320007", "survival time (observable entity)", {}),
+        ("419620001", "death (event)", {}),
         ("442476006", "arterial oxygen saturation (observable entity)", {}),
+        ("1086007", "female structure (body structure)", {}),
         ("26449-9", "26449-9", {}),
+        ("59542-1", "59542-1", {}),
+        ("21702-6", "21702-6", {}),
         (
             "B95",
             "streptococcus, staphylococcus, and enterococcus as the cause of diseases classified elsewhere",
@@ -25,80 +29,161 @@ def test_adapter_end_to_end():
         ("A02", "other salmonella infections", {}),
         ("C01", "malignant neoplasm of base of tongue", {}),
         ("1-100", "1-100", {}),
-        ("not_mapped_discrete_value", "not_mapped_discrete_value", {}),
+        ("not_mapped_binary_value", "not_mapped_binary_value", {}),
         ("not_mapped_continuous_value", "not_mapped_continuous_value", {}),
+        ("not_mapped_categorical_value", "not_mapped_categorical_value", {}),
     ]
     for node in nodes:
         assert node in expected_nodes
 
     expected_edges = [
+        ("E0", "116154003_2", "1086007", "HAS_CLINICAL_PARAMETER_BINARY", {}),
+        ("E1", "116154003_4", "1086007", "HAS_CLINICAL_PARAMETER_BINARY", {}),
         (
-            "E0",
+            "E2",
             "116154003_1",
             "445320007",
-            "HAS_CLINICAL_PARAMETER",
+            "HAS_CLINICAL_PARAMETER_CONTINUOUS",
             {"value": 150.0},
         ),
         (
-            "E1",
+            "E3",
             "116154003_2",
             "445320007",
-            "HAS_CLINICAL_PARAMETER",
+            "HAS_CLINICAL_PARAMETER_CONTINUOUS",
             {"value": 164.0},
         ),
         (
-            "E2",
+            "E4",
+            "116154003_1",
+            "419620001",
+            "HAS_CLINICAL_PARAMETER_CONTINUOUS",
+            {"value": 1.0},
+        ),
+        (
+            "E5",
+            "116154003_2",
+            "419620001",
+            "HAS_CLINICAL_PARAMETER_CONTINUOUS",
+            {"value": 0.0},
+        ),
+        (
+            "E6",
             "116154003_1",
             "442476006",
-            "HAS_CLINICAL_PARAMETER",
+            "HAS_CLINICAL_PARAMETER_CONTINUOUS",
             {"value": 97.0},
         ),
         (
-            "E3",
+            "E7",
             "116154003_2",
             "442476006",
-            "HAS_CLINICAL_PARAMETER",
+            "HAS_CLINICAL_PARAMETER_CONTINUOUS",
             {"value": 96.0},
         ),
         (
-            "E4",
+            "E8",
             "116154003_4",
             "442476006",
-            "HAS_CLINICAL_PARAMETER",
+            "HAS_CLINICAL_PARAMETER_CONTINUOUS",
             {"value": 94.0},
         ),
-        ("E5", "116154003_1", "26449-9", "HAS_LAB_VALUE", {"value": 0.11}),
-        ("E6", "116154003_2", "26449-9", "HAS_LAB_VALUE", {"value": 0.12}),
-        ("E7", "116154003_4", "26449-9", "HAS_LAB_VALUE", {"value": 0.14}),
-        ("E8", "116154003_2", "B95", "HAS_DISEASE", {}),
-        ("E9", "116154003_1", "A02", "HAS_DISEASE", {}),
-        ("E10", "116154003_2", "A02", "HAS_DISEASE", {}),
-        ("E11", "116154003_3", "C01", "HAS_DISEASE", {}),
-        ("E12", "116154003_1", "1-100", "HAS_TREATMENT", {}),
-        ("E13", "116154003_2", "1-100", "HAS_TREATMENT", {}),
+        ("E9", "116154003_1", "21702-6", "HAS_LAB_VALUE_BINARY", {}),
+        (
+            "E10",
+            "116154003_1",
+            "26449-9",
+            "HAS_LAB_VALUE_CONTINUOUS",
+            {"value": 0.11},
+        ),
+        (
+            "E11",
+            "116154003_2",
+            "26449-9",
+            "HAS_LAB_VALUE_CONTINUOUS",
+            {"value": 0.12},
+        ),
+        (
+            "E12",
+            "116154003_4",
+            "26449-9",
+            "HAS_LAB_VALUE_CONTINUOUS",
+            {"value": 0.14},
+        ),
+        (
+            "E13",
+            "116154003_1",
+            "59542-1",
+            "HAS_LAB_VALUE_CATEGORICAL",
+            {"value": 1.0},
+        ),
         (
             "E14",
+            "116154003_2",
+            "59542-1",
+            "HAS_LAB_VALUE_CATEGORICAL",
+            {"value": 2.0},
+        ),
+        (
+            "E15",
+            "116154003_4",
+            "59542-1",
+            "HAS_LAB_VALUE_CATEGORICAL",
+            {"value": 3.0},
+        ),
+        ("E16", "116154003_2", "B95", "HAS_DISEASE", {}),
+        ("E17", "116154003_1", "A02", "HAS_DISEASE", {}),
+        ("E18", "116154003_2", "A02", "HAS_DISEASE", {}),
+        ("E19", "116154003_3", "C01", "HAS_DISEASE", {}),
+        ("E20", "116154003_1", "1-100", "HAS_TREATMENT", {}),
+        ("E21", "116154003_2", "1-100", "HAS_TREATMENT", {}),
+        (
+            "E22",
             "116154003_1",
-            "not_mapped_discrete_value",
+            "not_mapped_binary_value",
             "NOT_DEFINED_BINARY",
             {},
         ),
         (
-            "E15",
+            "E23",
             "116154003_1",
             "not_mapped_continuous_value",
             "NOT_DEFINED_CONTINUOUS",
             {"value": 0.1},
         ),
         (
-            "E16",
+            "E24",
             "116154003_2",
             "not_mapped_continuous_value",
             "NOT_DEFINED_CONTINUOUS",
             {"value": 0.0},
         ),
+        (
+            "E25",
+            "116154003_1",
+            "not_mapped_categorical_value",
+            "NOT_DEFINED_CATEGORICAL",
+            {"value": 1.0},
+        ),
+        (
+            "E26",
+            "116154003_2",
+            "not_mapped_categorical_value",
+            "NOT_DEFINED_CATEGORICAL",
+            {"value": 2.0},
+        ),
+        (
+            "E27",
+            "116154003_4",
+            "not_mapped_categorical_value",
+            "NOT_DEFINED_CATEGORICAL",
+            {"value": 3.0},
+        ),
     ]
     edges = list(adapter.get_edges())
+    # sort edges by relationship id (first tuple element; string order)
+    edges = sorted(edges, key=lambda x: x[0])
+    print(edges)
     for edge in edges:
         assert edge in expected_edges
-    assert len(edges) == 17
+    assert len(edges) == 28