From d7e977b45838249a3de30681d08873ca5d43dea3 Mon Sep 17 00:00:00 2001 From: Nils Krehl Date: Fri, 12 Apr 2024 12:57:42 +0200 Subject: [PATCH] explicit split between binary, continuous and categorical --- data/example_input/mapping.yaml | 66 ++++++++- data/example_input/mock_data.csv | 10 +- .../adapters/clinical_dataset_adapter.py | 87 +++++++----- test/test_integration.py | 133 ++++++++++++++---- 4 files changed, 225 insertions(+), 71 deletions(-) diff --git a/data/example_input/mapping.yaml b/data/example_input/mapping.yaml index 675c994..eb9ea39 100644 --- a/data/example_input/mapping.yaml +++ b/data/example_input/mapping.yaml @@ -4,60 +4,107 @@ Nodes: coding_system: snomedct object_type: instance id_in_coding_system: 116154003 + col_value_type: categorical Overall Survival (days): coding_system: snomedct object_type: concept id_in_coding_system: 445320007 + col_value_type: continuous + death_observed: + coding_system: snomedct + object_type: concept + id_in_coding_system: 419620001 + col_value_type: continuous Clinical_Oxygen saturation in Arterial blood ; %: coding_system: snomedct object_type: concept id_in_coding_system: 442476006 + col_value_type: continuous + gender_female: + coding_system: snomedct + object_type: concept + id_in_coding_system: 1086007 + col_value_type: binary # Lab values (Loinc) LAB_Eos. Granulozyten# ; /nl: coding_system: loinc object_type: concept id_in_coding_system: 26449-9 + col_value_type: continuous + Grading: + coding_system: loinc + object_type: concept + id_in_coding_system: 59542-1 + col_value_type: categorical + KRAS: + coding_system: loinc + object_type: concept + id_in_coding_system: 21702-6 + col_value_type: binary # Diseases (ICD) ICD_B95: coding_system: icd10 object_type: concept id_in_coding_system: B95 + col_value_type: binary ICD_A02: coding_system: icd10 object_type: concept id_in_coding_system: A02 + col_value_type: binary Cancer_C01: coding_system: icd10 object_type: concept id_in_coding_system: C01 + col_value_type: binary # Operations and procedures (german OPS) OPS_1-100: coding_system: ops object_type: concept id_in_coding_system: 1-100 + col_value_type: binary # not mapped columns - not_mapped_discrete_value: + not_mapped_binary_value: coding_system: not_mapped_to_ontology object_type: concept id_in_coding_system: .nan + col_value_type: binary not_mapped_continuous_value: coding_system: not_mapped_to_ontology object_type: concept id_in_coding_system: .nan + col_value_type: continuous + not_mapped_categorical_value: + coding_system: not_mapped_to_ontology + object_type: concept + id_in_coding_system: .nan + col_value_type: categorical Edges: - HAS_CLINICAL_PARAMETER: + HAS_CLINICAL_PARAMETER_BINARY: + source_node: Patient ID + target_nodes: [gender_female] + HAS_CLINICAL_PARAMETER_CONTINUOUS: source_node: Patient ID - target_nodes: [Overall Survival (days), Clinical_Oxygen saturation in Arterial blood ; %] + target_nodes: [Overall Survival (days), death_observed, Clinical_Oxygen saturation in Arterial blood ; %] properties: value: - type: float # TODO: use float because int is also float (but is this really a good solution?) - HAS_LAB_VALUE: + type: float + HAS_LAB_VALUE_BINARY: + source_node: Patient ID + target_nodes: [KRAS] + HAS_LAB_VALUE_CONTINUOUS: source_node: Patient ID target_nodes: [LAB_Eos. Granulozyten# ; /nl] properties: value: type: float + HAS_LAB_VALUE_CATEGORICAL: + source_node: Patient ID + target_nodes: [Grading] + properties: + value: + type: float HAS_DISEASE: source_node: Patient ID target_nodes: [ICD_B95, ICD_A02, Cancer_C01] @@ -67,7 +114,7 @@ Edges: NOT_DEFINED_BINARY: source_node: Patient ID target_nodes: - - not_mapped_discrete_value + - not_mapped_binary_value NOT_DEFINED_CONTINUOUS: properties: value: @@ -75,3 +122,10 @@ Edges: source_node: Patient ID target_nodes: - not_mapped_continuous_value + NOT_DEFINED_CATEGORICAL: + properties: + value: + type: float + source_node: Patient ID + target_nodes: + - not_mapped_categorical_value diff --git a/data/example_input/mock_data.csv b/data/example_input/mock_data.csv index f9a0ee8..3b32036 100644 --- a/data/example_input/mock_data.csv +++ b/data/example_input/mock_data.csv @@ -1,5 +1,5 @@ -Patient ID,Overall Survival (days),LAB_Eos. Granulozyten# ; /nl,ICD_B95,ICD_A02,Clinical_Oxygen saturation in Arterial blood ; %,Cancer_C01,OPS_1-100,not_mapped_discrete_value,not_mapped_continuous_value -1,150,0.11,0,1,97,0,1,1,0.1 -2,164,0.12,1,1,96,0,1,0,0.0 -3,"",,,,,1,,, -4,,0.14,0,0,94,,0,, +Patient ID,Overall Survival (days),death_observed,Clinical_Oxygen saturation in Arterial blood ; %,gender_female,LAB_Eos. Granulozyten# ; /nl,Grading,KRAS,ICD_B95,ICD_A02,Cancer_C01,OPS_1-100,not_mapped_binary_value,not_mapped_continuous_value,not_mapped_categorical_value +1,150,1,97,0,0.11,1,1,0,1,0,1,1,0.1,1 +2,164,0,96,1,0.12,2,0,1,1,0,1,0,0,2 +3,,,,,,,,,,1,,,, +4,"","",94,1,0.14,3,0,0,0,,0,,,3 diff --git a/patient_kg/adapters/clinical_dataset_adapter.py b/patient_kg/adapters/clinical_dataset_adapter.py index 51bf9d2..c212c1a 100644 --- a/patient_kg/adapters/clinical_dataset_adapter.py +++ b/patient_kg/adapters/clinical_dataset_adapter.py @@ -170,49 +170,75 @@ def get_edges(self): source_node_id = self.dataset_mapping["Nodes"][source_node][ "id_in_coding_system" ] - target_nodes_dict = {} + target_node_id_to_col_name = {} for target_node_name in target_nodes_list: target_node_coding_system = self.dataset_mapping["Nodes"][ target_node_name ]["coding_system"] if "not_mapped_to_ontology" in target_node_coding_system: - target_nodes_dict[target_node_name] = target_node_name + target_node_id_to_col_name[ + target_node_name + ] = target_node_name else: - target_nodes_dict[ + target_node_id_to_col_name[ self.dataset_mapping["Nodes"][target_node_name][ "id_in_coding_system" ] ] = target_node_name - for target_node_id, target_node_value in target_nodes_dict.items(): + for ( + target_node_id, + target_node_col_name, + ) in target_node_id_to_col_name.items(): + target_node_col_value_type = self.dataset_mapping["Nodes"][ + target_node_col_name + ]["col_value_type"] + for row_index, row in self.dataset.iterrows(): + edge = None source_node_id_instance = ( f"{source_node_id}_{int(row[source_node])}" ) - properties = {} - if defined_properties is not None: - # weighted edge_name - for property in defined_properties: - if ( - not pd.isna(row[target_node_value]) - or row[target_node_value] == "" - ): + + if ( + target_node_col_value_type == "continuous" + or target_node_col_value_type == "categorical" + ): + if defined_properties is not None: + # weighted edge_name + for property in defined_properties: if ( - defined_properties[property]["type"] - == "int" - ): - properties[property] = int( - row[target_node_value] - ) - elif ( - defined_properties[property]["type"] - == "float" + not pd.isna(row[target_node_col_name]) + or row[target_node_col_name] == "" ): - properties[property] = float( - row[target_node_value] - ) - if properties != {}: + if ( + defined_properties[property]["type"] + == "int" + ): + properties[property] = int( + row[target_node_col_name] + ) + elif ( + defined_properties[property]["type"] + == "float" + ): + properties[property] = float( + row[target_node_col_name] + ) + if properties != {}: + relationship_id = "E" + str(edge_id) + edge_id += 1 + edge = Edge.create_instance( + relationship_id, + source_node_id_instance, + target_node_id, + edge_name, + properties, + ) + if target_node_col_value_type == "binary": + if row[target_node_col_name] == 1: + # binary edge_name relationship_id = "E" + str(edge_id) edge_id += 1 edge = Edge.create_instance( @@ -222,17 +248,6 @@ def get_edges(self): edge_name, properties, ) - if row[target_node_value] == 1: - # binary edge_name - relationship_id = "E" + str(edge_id) - edge_id += 1 - edge = Edge.create_instance( - relationship_id, - source_node_id_instance, - target_node_id, - edge_name, - properties, - ) if edge is not None: # logger.info(f"Adding edge_name {edge.get_relationship_id()}, {edge.get_source_node_id()}, {edge.get_target_node_id()}, {edge.get_label()}, {edge.get_properties()}") diff --git a/test/test_integration.py b/test/test_integration.py index b97023f..688ab67 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -15,8 +15,12 @@ def test_adapter_end_to_end(): ("116154003_3", "patient (person)", {}), ("116154003_4", "patient (person)", {}), ("445320007", "survival time (observable entity)", {}), + ("419620001", "death (event)", {}), ("442476006", "arterial oxygen saturation (observable entity)", {}), + ("1086007", "female structure (body structure)", {}), ("26449-9", "26449-9", {}), + ("59542-1", "59542-1", {}), + ("21702-6", "21702-6", {}), ( "B95", "streptococcus, staphylococcus, and enterococcus as the cause of diseases classified elsewhere", @@ -25,80 +29,161 @@ def test_adapter_end_to_end(): ("A02", "other salmonella infections", {}), ("C01", "malignant neoplasm of base of tongue", {}), ("1-100", "1-100", {}), - ("not_mapped_discrete_value", "not_mapped_discrete_value", {}), + ("not_mapped_binary_value", "not_mapped_binary_value", {}), ("not_mapped_continuous_value", "not_mapped_continuous_value", {}), + ("not_mapped_categorical_value", "not_mapped_categorical_value", {}), ] for node in nodes: assert node in expected_nodes expected_edges = [ + ("E0", "116154003_2", "1086007", "HAS_CLINICAL_PARAMETER_BINARY", {}), + ("E1", "116154003_4", "1086007", "HAS_CLINICAL_PARAMETER_BINARY", {}), ( - "E0", + "E2", "116154003_1", "445320007", - "HAS_CLINICAL_PARAMETER", + "HAS_CLINICAL_PARAMETER_CONTINUOUS", {"value": 150.0}, ), ( - "E1", + "E3", "116154003_2", "445320007", - "HAS_CLINICAL_PARAMETER", + "HAS_CLINICAL_PARAMETER_CONTINUOUS", {"value": 164.0}, ), ( - "E2", + "E4", + "116154003_1", + "419620001", + "HAS_CLINICAL_PARAMETER_CONTINUOUS", + {"value": 1.0}, + ), + ( + "E5", + "116154003_2", + "419620001", + "HAS_CLINICAL_PARAMETER_CONTINUOUS", + {"value": 0.0}, + ), + ( + "E6", "116154003_1", "442476006", - "HAS_CLINICAL_PARAMETER", + "HAS_CLINICAL_PARAMETER_CONTINUOUS", {"value": 97.0}, ), ( - "E3", + "E7", "116154003_2", "442476006", - "HAS_CLINICAL_PARAMETER", + "HAS_CLINICAL_PARAMETER_CONTINUOUS", {"value": 96.0}, ), ( - "E4", + "E8", "116154003_4", "442476006", - "HAS_CLINICAL_PARAMETER", + "HAS_CLINICAL_PARAMETER_CONTINUOUS", {"value": 94.0}, ), - ("E5", "116154003_1", "26449-9", "HAS_LAB_VALUE", {"value": 0.11}), - ("E6", "116154003_2", "26449-9", "HAS_LAB_VALUE", {"value": 0.12}), - ("E7", "116154003_4", "26449-9", "HAS_LAB_VALUE", {"value": 0.14}), - ("E8", "116154003_2", "B95", "HAS_DISEASE", {}), - ("E9", "116154003_1", "A02", "HAS_DISEASE", {}), - ("E10", "116154003_2", "A02", "HAS_DISEASE", {}), - ("E11", "116154003_3", "C01", "HAS_DISEASE", {}), - ("E12", "116154003_1", "1-100", "HAS_TREATMENT", {}), - ("E13", "116154003_2", "1-100", "HAS_TREATMENT", {}), + ("E9", "116154003_1", "21702-6", "HAS_LAB_VALUE_BINARY", {}), + ( + "E10", + "116154003_1", + "26449-9", + "HAS_LAB_VALUE_CONTINUOUS", + {"value": 0.11}, + ), + ( + "E11", + "116154003_2", + "26449-9", + "HAS_LAB_VALUE_CONTINUOUS", + {"value": 0.12}, + ), + ( + "E12", + "116154003_4", + "26449-9", + "HAS_LAB_VALUE_CONTINUOUS", + {"value": 0.14}, + ), + ( + "E13", + "116154003_1", + "59542-1", + "HAS_LAB_VALUE_CATEGORICAL", + {"value": 1.0}, + ), ( "E14", + "116154003_2", + "59542-1", + "HAS_LAB_VALUE_CATEGORICAL", + {"value": 2.0}, + ), + ( + "E15", + "116154003_4", + "59542-1", + "HAS_LAB_VALUE_CATEGORICAL", + {"value": 3.0}, + ), + ("E16", "116154003_2", "B95", "HAS_DISEASE", {}), + ("E17", "116154003_1", "A02", "HAS_DISEASE", {}), + ("E18", "116154003_2", "A02", "HAS_DISEASE", {}), + ("E19", "116154003_3", "C01", "HAS_DISEASE", {}), + ("E20", "116154003_1", "1-100", "HAS_TREATMENT", {}), + ("E21", "116154003_2", "1-100", "HAS_TREATMENT", {}), + ( + "E22", "116154003_1", - "not_mapped_discrete_value", + "not_mapped_binary_value", "NOT_DEFINED_BINARY", {}, ), ( - "E15", + "E23", "116154003_1", "not_mapped_continuous_value", "NOT_DEFINED_CONTINUOUS", {"value": 0.1}, ), ( - "E16", + "E24", "116154003_2", "not_mapped_continuous_value", "NOT_DEFINED_CONTINUOUS", {"value": 0.0}, ), + ( + "E25", + "116154003_1", + "not_mapped_categorical_value", + "NOT_DEFINED_CATEGORICAL", + {"value": 1.0}, + ), + ( + "E26", + "116154003_2", + "not_mapped_categorical_value", + "NOT_DEFINED_CATEGORICAL", + {"value": 2.0}, + ), + ( + "E27", + "116154003_4", + "not_mapped_categorical_value", + "NOT_DEFINED_CATEGORICAL", + {"value": 3.0}, + ), ] edges = list(adapter.get_edges()) + # order edges by the first element of the tuple + edges = sorted(edges, key=lambda x: x[0]) + print(edges) for edge in edges: assert edge in expected_edges - assert len(edges) == 17 + assert len(edges) == 28