Skip to content

Commit

Permalink
explicit split between binary, continuous and categorical
Browse files Browse the repository at this point in the history
  • Loading branch information
nilskre committed Apr 12, 2024
1 parent 160532b commit d7e977b
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 71 deletions.
66 changes: 60 additions & 6 deletions data/example_input/mapping.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,60 +4,107 @@ Nodes:
coding_system: snomedct
object_type: instance
id_in_coding_system: 116154003
col_value_type: categorical
Overall Survival (days):
coding_system: snomedct
object_type: concept
id_in_coding_system: 445320007
col_value_type: continuous
death_observed:
coding_system: snomedct
object_type: concept
id_in_coding_system: 419620001
col_value_type: continuous
Clinical_Oxygen saturation in Arterial blood ; %:
coding_system: snomedct
object_type: concept
id_in_coding_system: 442476006
col_value_type: continuous
gender_female:
coding_system: snomedct
object_type: concept
id_in_coding_system: 1086007
col_value_type: binary
# Lab values (Loinc)
LAB_Eos. Granulozyten# ; /nl:
coding_system: loinc
object_type: concept
id_in_coding_system: 26449-9
col_value_type: continuous
Grading:
coding_system: loinc
object_type: concept
id_in_coding_system: 59542-1
col_value_type: categorical
KRAS:
coding_system: loinc
object_type: concept
id_in_coding_system: 21702-6
col_value_type: binary
# Diseases (ICD)
ICD_B95:
coding_system: icd10
object_type: concept
id_in_coding_system: B95
col_value_type: binary
ICD_A02:
coding_system: icd10
object_type: concept
id_in_coding_system: A02
col_value_type: binary
Cancer_C01:
coding_system: icd10
object_type: concept
id_in_coding_system: C01
col_value_type: binary
# Operations and procedures (german OPS)
OPS_1-100:
coding_system: ops
object_type: concept
id_in_coding_system: 1-100
col_value_type: binary
# not mapped columns
not_mapped_discrete_value:
not_mapped_binary_value:
coding_system: not_mapped_to_ontology
object_type: concept
id_in_coding_system: .nan
col_value_type: binary
not_mapped_continuous_value:
coding_system: not_mapped_to_ontology
object_type: concept
id_in_coding_system: .nan
col_value_type: continuous
not_mapped_categorical_value:
coding_system: not_mapped_to_ontology
object_type: concept
id_in_coding_system: .nan
col_value_type: categorical

Edges:
HAS_CLINICAL_PARAMETER:
HAS_CLINICAL_PARAMETER_BINARY:
source_node: Patient ID
target_nodes: [gender_female]
HAS_CLINICAL_PARAMETER_CONTINUOUS:
source_node: Patient ID
target_nodes: [Overall Survival (days), Clinical_Oxygen saturation in Arterial blood ; %]
target_nodes: [Overall Survival (days), death_observed, Clinical_Oxygen saturation in Arterial blood ; %]
properties:
value:
type: float # TODO: float is used because it can also hold int values; revisit whether a single float type is really the right solution
HAS_LAB_VALUE:
type: float
HAS_LAB_VALUE_BINARY:
source_node: Patient ID
target_nodes: [KRAS]
HAS_LAB_VALUE_CONTINUOUS:
source_node: Patient ID
target_nodes: [LAB_Eos. Granulozyten# ; /nl]
properties:
value:
type: float
HAS_LAB_VALUE_CATEGORICAL:
source_node: Patient ID
target_nodes: [Grading]
properties:
value:
type: float
HAS_DISEASE:
source_node: Patient ID
target_nodes: [ICD_B95, ICD_A02, Cancer_C01]
Expand All @@ -67,11 +114,18 @@ Edges:
NOT_DEFINED_BINARY:
source_node: Patient ID
target_nodes:
- not_mapped_discrete_value
- not_mapped_binary_value
NOT_DEFINED_CONTINUOUS:
properties:
value:
type: float
source_node: Patient ID
target_nodes:
- not_mapped_continuous_value
NOT_DEFINED_CATEGORICAL:
properties:
value:
type: float
source_node: Patient ID
target_nodes:
- not_mapped_categorical_value
10 changes: 5 additions & 5 deletions data/example_input/mock_data.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Patient ID,Overall Survival (days),LAB_Eos. Granulozyten# ; /nl,ICD_B95,ICD_A02,Clinical_Oxygen saturation in Arterial blood ; %,Cancer_C01,OPS_1-100,not_mapped_discrete_value,not_mapped_continuous_value
1,150,0.11,0,1,97,0,1,1,0.1
2,164,0.12,1,1,96,0,1,0,0.0
3,"",,,,,1,,,
4,,0.14,0,0,94,,0,,
Patient ID,Overall Survival (days),death_observed,Clinical_Oxygen saturation in Arterial blood ; %,gender_female,LAB_Eos. Granulozyten# ; /nl,Grading,KRAS,ICD_B95,ICD_A02,Cancer_C01,OPS_1-100,not_mapped_binary_value,not_mapped_continuous_value,not_mapped_categorical_value
1,150,1,97,0,0.11,1,1,0,1,0,1,1,0.1,1
2,164,0,96,1,0.12,2,0,1,1,0,1,0,0,2
3,,,,,,,,,,1,,,,
4,"","",94,1,0.14,3,0,0,0,,0,,,3
87 changes: 51 additions & 36 deletions patient_kg/adapters/clinical_dataset_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,49 +170,75 @@ def get_edges(self):
source_node_id = self.dataset_mapping["Nodes"][source_node][
"id_in_coding_system"
]
target_nodes_dict = {}
target_node_id_to_col_name = {}
for target_node_name in target_nodes_list:
target_node_coding_system = self.dataset_mapping["Nodes"][
target_node_name
]["coding_system"]
if "not_mapped_to_ontology" in target_node_coding_system:
target_nodes_dict[target_node_name] = target_node_name
target_node_id_to_col_name[
target_node_name
] = target_node_name
else:
target_nodes_dict[
target_node_id_to_col_name[
self.dataset_mapping["Nodes"][target_node_name][
"id_in_coding_system"
]
] = target_node_name

for target_node_id, target_node_value in target_nodes_dict.items():
for (
target_node_id,
target_node_col_name,
) in target_node_id_to_col_name.items():
target_node_col_value_type = self.dataset_mapping["Nodes"][
target_node_col_name
]["col_value_type"]

for row_index, row in self.dataset.iterrows():
edge = None
source_node_id_instance = (
f"{source_node_id}_{int(row[source_node])}"
)

properties = {}
if defined_properties is not None:
# weighted edge_name
for property in defined_properties:
if (
not pd.isna(row[target_node_value])
or row[target_node_value] == ""
):

if (
target_node_col_value_type == "continuous"
or target_node_col_value_type == "categorical"
):
if defined_properties is not None:
# weighted edge_name
for property in defined_properties:
if (
defined_properties[property]["type"]
== "int"
):
properties[property] = int(
row[target_node_value]
)
elif (
defined_properties[property]["type"]
== "float"
not pd.isna(row[target_node_col_name])
or row[target_node_col_name] == ""
):
properties[property] = float(
row[target_node_value]
)
if properties != {}:
if (
defined_properties[property]["type"]
== "int"
):
properties[property] = int(
row[target_node_col_name]
)
elif (
defined_properties[property]["type"]
== "float"
):
properties[property] = float(
row[target_node_col_name]
)
if properties != {}:
relationship_id = "E" + str(edge_id)
edge_id += 1
edge = Edge.create_instance(
relationship_id,
source_node_id_instance,
target_node_id,
edge_name,
properties,
)
if target_node_col_value_type == "binary":
if row[target_node_col_name] == 1:
# binary edge_name
relationship_id = "E" + str(edge_id)
edge_id += 1
edge = Edge.create_instance(
Expand All @@ -222,17 +248,6 @@ def get_edges(self):
edge_name,
properties,
)
if row[target_node_value] == 1:
# binary edge_name
relationship_id = "E" + str(edge_id)
edge_id += 1
edge = Edge.create_instance(
relationship_id,
source_node_id_instance,
target_node_id,
edge_name,
properties,
)

if edge is not None:
# logger.info(f"Adding edge_name {edge.get_relationship_id()}, {edge.get_source_node_id()}, {edge.get_target_node_id()}, {edge.get_label()}, {edge.get_properties()}")
Expand Down
Loading

0 comments on commit d7e977b

Please sign in to comment.