Explicit split between binary, continuous and categorical column value types

biocypher · Apr 12, 2024 · d7e977b · d7e977b
1 parent 160532b
commit d7e977b
Show file tree

Hide file tree

Showing 4 changed files with 225 additions and 71 deletions.
diff --git a/data/example_input/mapping.yaml b/data/example_input/mapping.yaml
@@ -4,60 +4,107 @@ Nodes:
     coding_system: snomedct
     object_type: instance
     id_in_coding_system: 116154003
+    col_value_type: categorical
   Overall Survival (days):
     coding_system: snomedct
     object_type: concept
     id_in_coding_system: 445320007
+    col_value_type: continuous
+  death_observed:
+    coding_system: snomedct
+    object_type: concept
+    id_in_coding_system: 419620001
+    col_value_type: continuous
   Clinical_Oxygen saturation in Arterial blood ; %:
     coding_system: snomedct
     object_type: concept
     id_in_coding_system: 442476006
+    col_value_type: continuous
+  gender_female:
+    coding_system: snomedct
+    object_type: concept
+    id_in_coding_system: 1086007
+    col_value_type: binary
   # Lab values (LOINC)
   LAB_Eos. Granulozyten# ; /nl:
     coding_system: loinc
     object_type: concept
     id_in_coding_system: 26449-9
+    col_value_type: continuous
+  Grading:
+    coding_system: loinc
+    object_type: concept
+    id_in_coding_system: 59542-1
+    col_value_type: categorical
+  KRAS:
+    coding_system: loinc
+    object_type: concept
+    id_in_coding_system: 21702-6
+    col_value_type: binary
   # Diseases (ICD)
   ICD_B95:
     coding_system: icd10
     object_type: concept
     id_in_coding_system: B95
+    col_value_type: binary
   ICD_A02:
     coding_system: icd10
     object_type: concept
     id_in_coding_system: A02
+    col_value_type: binary
   Cancer_C01:
     coding_system: icd10
     object_type: concept
     id_in_coding_system: C01
+    col_value_type: binary
   # Operations and procedures (German OPS)
   OPS_1-100:
     coding_system: ops
     object_type: concept
     id_in_coding_system: 1-100
+    col_value_type: binary
   # not mapped columns
-  not_mapped_discrete_value:
+  not_mapped_binary_value:
     coding_system: not_mapped_to_ontology
     object_type: concept
     id_in_coding_system: .nan
+    col_value_type: binary
   not_mapped_continuous_value:
     coding_system: not_mapped_to_ontology
     object_type: concept
     id_in_coding_system: .nan
+    col_value_type: continuous
+  not_mapped_categorical_value:
+    coding_system: not_mapped_to_ontology
+    object_type: concept
+    id_in_coding_system: .nan
+    col_value_type: categorical
 
 Edges:
-  HAS_CLINICAL_PARAMETER:
+  HAS_CLINICAL_PARAMETER_BINARY:
+    source_node: Patient ID
+    target_nodes: [gender_female]
+  HAS_CLINICAL_PARAMETER_CONTINUOUS:
     source_node: Patient ID
-    target_nodes: [Overall Survival (days), Clinical_Oxygen saturation in Arterial blood ; %]
+    target_nodes: [Overall Survival (days), death_observed, Clinical_Oxygen saturation in Arterial blood ; %]
     properties:
       value:
-        type: float # TODO: use float because int is also float (but is this really a good solution?)
-  HAS_LAB_VALUE:
+        type: float
+  HAS_LAB_VALUE_BINARY:
+    source_node: Patient ID
+    target_nodes: [KRAS]
+  HAS_LAB_VALUE_CONTINUOUS:
     source_node: Patient ID
     target_nodes: [LAB_Eos. Granulozyten# ; /nl]
     properties:
       value:
         type: float
+  HAS_LAB_VALUE_CATEGORICAL:
+    source_node: Patient ID
+    target_nodes: [Grading]
+    properties:
+      value:
+        type: float
   HAS_DISEASE:
     source_node: Patient ID
     target_nodes: [ICD_B95, ICD_A02, Cancer_C01]
@@ -67,11 +114,18 @@ Edges:
   NOT_DEFINED_BINARY:
     source_node: Patient ID
     target_nodes:
-      - not_mapped_discrete_value
+      - not_mapped_binary_value
   NOT_DEFINED_CONTINUOUS:
     properties:
       value:
         type: float
     source_node: Patient ID
     target_nodes:
       - not_mapped_continuous_value
+  NOT_DEFINED_CATEGORICAL:
+    properties:
+      value:
+        type: float
+    source_node: Patient ID
+    target_nodes:
+      - not_mapped_categorical_value
diff --git a/data/example_input/mock_data.csv b/data/example_input/mock_data.csv
@@ -1,5 +1,5 @@
-Patient ID,Overall Survival (days),LAB_Eos. Granulozyten# ; /nl,ICD_B95,ICD_A02,Clinical_Oxygen saturation in Arterial blood ; %,Cancer_C01,OPS_1-100,not_mapped_discrete_value,not_mapped_continuous_value
-1,150,0.11,0,1,97,0,1,1,0.1
-2,164,0.12,1,1,96,0,1,0,0.0
-3,"",,,,,1,,,
-4,,0.14,0,0,94,,0,,
+Patient ID,Overall Survival (days),death_observed,Clinical_Oxygen saturation in Arterial blood ; %,gender_female,LAB_Eos. Granulozyten# ; /nl,Grading,KRAS,ICD_B95,ICD_A02,Cancer_C01,OPS_1-100,not_mapped_binary_value,not_mapped_continuous_value,not_mapped_categorical_value
+1,150,1,97,0,0.11,1,1,0,1,0,1,1,0.1,1
+2,164,0,96,1,0.12,2,0,1,1,0,1,0,0,2
+3,,,,,,,,,,1,,,,
+4,"","",94,1,0.14,3,0,0,0,,0,,,3
diff --git a/patient_kg/adapters/clinical_dataset_adapter.py b/patient_kg/adapters/clinical_dataset_adapter.py
@@ -170,49 +170,75 @@ def get_edges(self):
             source_node_id = self.dataset_mapping["Nodes"][source_node][
                 "id_in_coding_system"
             ]
-            target_nodes_dict = {}
+            target_node_id_to_col_name = {}
             for target_node_name in target_nodes_list:
                 target_node_coding_system = self.dataset_mapping["Nodes"][
                     target_node_name
                 ]["coding_system"]
                 if "not_mapped_to_ontology" in target_node_coding_system:
-                    target_nodes_dict[target_node_name] = target_node_name
+                    target_node_id_to_col_name[
+                        target_node_name
+                    ] = target_node_name
                 else:
-                    target_nodes_dict[
+                    target_node_id_to_col_name[
                         self.dataset_mapping["Nodes"][target_node_name][
                             "id_in_coding_system"
                         ]
                     ] = target_node_name
 
-            for target_node_id, target_node_value in target_nodes_dict.items():
+            for (
+                target_node_id,
+                target_node_col_name,
+            ) in target_node_id_to_col_name.items():
+                target_node_col_value_type = self.dataset_mapping["Nodes"][
+                    target_node_col_name
+                ]["col_value_type"]
+
                 for row_index, row in self.dataset.iterrows():
+                    edge = None
                     source_node_id_instance = (
                         f"{source_node_id}_{int(row[source_node])}"
                     )
-
                     properties = {}
-                    if defined_properties is not None:
-                        # weighted edge_name
-                        for property in defined_properties:
-                            if (
-                                not pd.isna(row[target_node_value])
-                                or row[target_node_value] == ""
-                            ):
+
+                    if (
+                        target_node_col_value_type == "continuous"
+                        or target_node_col_value_type == "categorical"
+                    ):
+                        if defined_properties is not None:
+                            # weighted edge_name
+                            for property in defined_properties:
                                 if (
-                                    defined_properties[property]["type"]
-                                    == "int"
-                                ):
-                                    properties[property] = int(
-                                        row[target_node_value]
-                                    )
-                                elif (
-                                    defined_properties[property]["type"]
-                                    == "float"
+                                    not pd.isna(row[target_node_col_name])
+                                    or row[target_node_col_name] == ""
                                 ):
-                                    properties[property] = float(
-                                        row[target_node_value]
-                                    )
-                        if properties != {}:
+                                    if (
+                                        defined_properties[property]["type"]
+                                        == "int"
+                                    ):
+                                        properties[property] = int(
+                                            row[target_node_col_name]
+                                        )
+                                    elif (
+                                        defined_properties[property]["type"]
+                                        == "float"
+                                    ):
+                                        properties[property] = float(
+                                            row[target_node_col_name]
+                                        )
+                            if properties != {}:
+                                relationship_id = "E" + str(edge_id)
+                                edge_id += 1
+                                edge = Edge.create_instance(
+                                    relationship_id,
+                                    source_node_id_instance,
+                                    target_node_id,
+                                    edge_name,
+                                    properties,
+                                )
+                    if target_node_col_value_type == "binary":
+                        if row[target_node_col_name] == 1:
+                            # binary edge_name
                             relationship_id = "E" + str(edge_id)
                             edge_id += 1
                             edge = Edge.create_instance(
@@ -222,17 +248,6 @@ def get_edges(self):
                                 edge_name,
                                 properties,
                             )
-                    if row[target_node_value] == 1:
-                        # binary edge_name
-                        relationship_id = "E" + str(edge_id)
-                        edge_id += 1
-                        edge = Edge.create_instance(
-                            relationship_id,
-                            source_node_id_instance,
-                            target_node_id,
-                            edge_name,
-                            properties,
-                        )
 
                     if edge is not None:
                         # logger.info(f"Adding edge_name {edge.get_relationship_id()}, {edge.get_source_node_id()}, {edge.get_target_node_id()}, {edge.get_label()}, {edge.get_properties()}")