From b4d8a8205c78c384f91f89b6adf4b35e4fce7692 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 1 Oct 2024 09:50:34 -0700 Subject: [PATCH 01/24] HTAN schema translation to FHIR --- README.md | 23 +- fhirizer/htan2fhir.py | 106 +++ fhirizer/utils.py | 16 + resources/htan_resources/biospecimens.json | 406 +++++++++ resources/htan_resources/cases.json | 937 +++++++++++++++++++++ resources/htan_resources/files.json | 450 ++++++++++ setup.py | 2 +- 7 files changed, 1935 insertions(+), 5 deletions(-) create mode 100644 fhirizer/htan2fhir.py create mode 100644 resources/htan_resources/biospecimens.json create mode 100644 resources/htan_resources/cases.json create mode 100644 resources/htan_resources/files.json diff --git a/README.md b/README.md index 0d83bcb..d4eec77 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ### Project overview: -Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, and International Cancer Genome Consortium (ICGC) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format. +Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, International Cancer Genome Consortium (ICGC), and Human Tumor Atlas Network (HTAN) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format. - #### GDC study simplified FHIR graph ![mapping](./imgs/gdc_tcga_study_example_fhir_graph.png) @@ -75,6 +75,11 @@ Detailed step-by-step guide on FHIRizing data for a project's study can be found ``` fhirizer generate --name icgc --icgc --has_files ``` +- HTAN + + ``` + fhirizer generate --name htan --out_dir ./projects//META --entity_path ./projects// + ``` ### Constructing GDC maps cli cmds initialize initial structure of project, case, or file to add Maps @@ -145,9 +150,19 @@ fhirizer/ | | |-- filess.ndjson | | └── META/ | └── ICGC/ -| └── ICGC-STUDY/ -| |-- data/ -| └── META/ +| | └── ICGC-STUDY/ +| | |-- data/ +| | └── META/ +| └── HTAN/ +| | └── OHSU/ +| | └── Breast_NOS/ +| | |-- raw/ +| | | |-- files/ +| | | |-- biospecimens/ +| | | └── cases/ +| | └── META/ +| | +| | |--README.md └── setup.py ``` diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py new file mode 100644 index 0000000..19ad40a --- /dev/null +++ b/fhirizer/htan2fhir.py @@ -0,0 +1,106 @@ +import uuid +import json +import orjson +import copy +import glob +import pathlib +import inflection +import itertools +import pandas as pd +from fhirizer import utils +from pathlib import Path +import importlib.resources +from uuid import uuid3, NAMESPACE_DNS + +from fhir.resources.reference import Reference +from fhir.resources.identifier import Identifier +from fhir.resources.codeableconcept import CodeableConcept +from fhir.resources.patient import Patient +from fhir.resources.researchstudy import ResearchStudy +from fhir.resources.researchsubject import ResearchSubject +from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection +from fhir.resources.condition import Condition, ConditionStage +from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ + DocumentReferenceContentProfile +from fhir.resources.attachment import Attachment +from fhir.resources.observation import Observation +from fhir.resources.medicationadministration import MedicationAdministration +from fhir.resources.medication import Medication + +# File data on synapse after authentication +# https://github.com/Sage-Bionetworks/synapsePythonClient?tab=readme-ov-file#store-a-file-to-synapse + +project_id = 
"OHSU_Breast_NOS" +project_path = "./projects/HTAN/OHSU/Breast_NOS" + +SYSTEM_HTAN = 'https://data.humantumoratlas.org' +NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SYSTEM_HTAN) +verbose = True + +cases_mapping = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'htan_resources' / 'cases.json'))) + +biospecimens_mapping = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'htan_resources' / 'biospecimens.json'))) + +files_mapping = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'htan_resources' / 'files.json'))) + +# https://jen-dfci.github.io/htan_missing_manual/data_model/overview/ + +# cases_mappings +# https://data.humantumoratlas.org/standard/clinical +# cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter +# 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP +# 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient +cases_path = "".join([project_path, "/raw/cases/table_data.tsv"]) +cases = pd.read_csv(cases_path, sep="\t") + +# identifiers of the cases matrix/df +patient_identifier_field = "HTAN Participant ID" + + +def get_htan_field(match, field_maps, map_info): + for field, mappings in field_maps.items(): + assert isinstance(mappings, list), f"HTAN resource mappings is not a list: {type(mappings)}, {mappings}" + for entry_map in mappings: + if entry_map[map_info] and entry_map[map_info] == match: + yield field + break + + +components_fields = [] +for key in get_htan_field(match='Condition', field_maps=cases_mapping, map_info='focus'): + components_fields.append(key) + if verbose: + print(f"Observation focus -> condition - filed': {key}") + +observation_component_df = cases[[patient_identifier_field] + components_fields] + + +for key in get_htan_field(match='Observation.component', field_maps=cases_mapping, map_info='fhir_map'): + if verbose: + print(f"field name mapped to Observation.component': {key}") + +# _component = utils.get_component(key=field, value=_component_value, component_type=utils.get_data_types(type(_component_value)), system=SYSTEM_HTAN) + +# format for onsetAge +# "onsetAge": { +# "value": 23194, +# "unit": "days", +# "system": "http://unitsofmeasure.org", +# "code": "d" +# } + +# biospecimens_mapping +# biospecimens to Specimen / Observation -> Specimen +# 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. 
# check for participant id for type of reference +# 'Biospecimen Type': #NOTE: Doesn't seem informative +biospecimens_path = "".join([project_path, "/raw/biospecimens/table_data.tsv"]) +biospecimens = pd.read_csv(biospecimens_path, sep="\t") +biospecimen_identifier_field = "HTAN Biospecimen ID" + +# files_mapping +# files to DocumentReference / Attachment / Observation -> DocumentReference +files_metadata = pd.read_csv("".join([project_path, "/raw/files/table_data.tsv"]), sep="\t") +files_drs_uri = pd.read_csv("".join([project_path, "/raw/files/cds_manifest.csv"])) diff --git a/fhirizer/utils.py b/fhirizer/utils.py index fe0cb60..71fcad0 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -1074,6 +1074,22 @@ def ncit2mondo(path): return data +def get_data_types(data_type): + if data_type in ['int64', 'int32', 'int16']: + return 'int' + elif data_type in ['float64', 'float32', 'float16']: + return 'float' + elif data_type in ['string']: + return 'string' + elif data_type == 'bool': + return 'bool' + elif data_type in ['datetime64[ns]', 'timedelta64[ns]', 'period']: + return 'dateTime' + else: + print(f"New or Null Data type: {data_type}.") + return data_type + + def get_component(key, value=None, component_type=None, system="https://cadsr.cancer.gov/sample_laboratory_observation"): if component_type == 'string': value = {"valueString": value} diff --git a/resources/htan_resources/biospecimens.json b/resources/htan_resources/biospecimens.json new file mode 100644 index 0000000..9b2d88e --- /dev/null +++ b/resources/htan_resources/biospecimens.json @@ -0,0 +1,406 @@ +{ + "HTAN Biospecimen ID": [ + { + "fhir_map": "Specimen.identifier", + "use": "official", + "focus": null + } + ], + "Atlas Name": [ + { + "fhir_map": "ResearchStudy.name", + "focus": null + } + ], + "Source HTAN Biospecimen ID": [ + { + "fhir_map": "Specimen.identifier", + "use": "secondary", + "focus": null + } + ], + "HTAN Parent ID": [ + { + "fhir_map": "Specimen.parent", + "focus": null + } + ], + "Timepoint Label": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Collection Days from Index": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Adjacent Biospecimen IDs": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Type": [ + { + "fhir_map": "Specimen.type", + "focus": null + } + ], + "Acquisition Method Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Fixative Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Storage Method": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Processing Days from Index": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Protocol Link": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Site Data Source": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Collection Media": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Mounting Medium": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Processing Location": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Histology Assessment By": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Histology Assessment Medium": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Preinvasive Morphology": [ + { + "fhir_map": 
"Observation.component", + "focus": "Specimen" + } + ], + "Tumor Infiltrating Lymphocytes": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Degree of Dysplasia": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Dysplasia Fraction": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Number Proliferating Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Eosinophil Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Granulocyte Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Inflam Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Lymphocyte Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Monocyte Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Necrosis": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Neutrophil Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Normal Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Stromal Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Tumor Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Tumor Nuclei": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Fiducial Marker": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Slicing Method": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Lysis Buffer": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Method of Nucleic Acid Isolation": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Acquisition Method Other Specify": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Analyte Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Analyte Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Dimension 1": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Dimension 2": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Dimension 3": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Blood Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Bone Marrow Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Dimensions Unit": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Fixation Duration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "HTAN Parent Biospecimen ID": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Histologic Morphology Code": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Ischemic Temperature": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Ischemic Time": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Other Acquisition Method": [ + { + 
"fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Portion Weight": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Preservation Method": [ + { + "fhir_map": "Specimen.processing.method", + "focus": null + } + ], + "Section Number in Sequence": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Section Thickness Value": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Sectioning Days from Index": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Shipping Condition Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Slide Charge Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Specimen Laterality": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Tissue Biospecimen Type": [ + { + "fhir_map": "Specimen.type", + "focus": null + } + ], + "Total Volume": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Total Volume Unit": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Tumor Tissue Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Urine Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ] +} \ No newline at end of file diff --git a/resources/htan_resources/cases.json b/resources/htan_resources/cases.json new file mode 100644 index 0000000..e514419 --- /dev/null +++ b/resources/htan_resources/cases.json @@ -0,0 +1,937 @@ +{ + "HTAN Participant ID": [ + { + "fhir_map": "Patient.identifier", + "use": "official", + "focus": null + } + ], + "Atlas Name": [ + { + "fhir_map": "ResearchStudy.name", + "focus": null + } + ], + "Age at Diagnosis (years)": [ + { + "fhir_map": "Condition.onsetAge", + "focus": null + } + ], + "Year of Diagnosis": [ + { + "fhir_map": "Condition.recordedDate", + "focus": null + }, + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Primary Diagnosis": [ + { + "fhir_map": "Condition.code", + "focus": null + } + ], + "Precancerous Condition Type": [ + { + "fhir_map": "Condition.code", + "focus": null + } + ], + "Site of Resection or Biopsy": [ + { + "fhir_map": "Procedure.bodySite", + "focus": null + } + ], + "Tissue or Organ of Origin": [ + { + "fhir_map": "Condition.bodyStructure", + "focus": null + }, + { + "fhir_map": "Condition.bodySite", + "focus": null + } + ], + "Morphology": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + } + ], + "Progression or Recurrence": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Last Known Disease Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Last Follow up": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Last Known Disease Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Method of Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Prior Malignancy": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Prior Treatment": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Metastasis at Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + 
"Metastasis at Diagnosis Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "First Symptom Prior to Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Percent Tumor Invasion": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Residual Disease": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Synchronous Malignancy": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Confined to Organ of Origin": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Focality": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Largest Dimension Diameter": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Gross Tumor Weight": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Breslow Thickness": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Vascular Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Vascular Invasion Type": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Anaplasia Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Anaplasia Present Type": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Laterality": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Perineural Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Lymphatic Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Lymph Nodes Positive": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Lymph Nodes Tested": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Peritoneal Fluid Cytological Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Classification of Tumor": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Best Overall Response": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Mitotic Count": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "AJCC Clinical M": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Clinical N": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Clinical Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Clinical T": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Pathologic M": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Pathologic N": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + 
"focus": "Condition" + } + ], + "AJCC Pathologic Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Pathologic T": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Staging System Edition": [ + { + "fhir_map": "Condition.stage.type", + "focus": null + }, + { + "fhir_map": "Observation.code", + "focus": "Condition" + } + ], + "Cog Neuroblastoma Risk Group": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Cog Rhabdomyosarcoma Risk Group": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Gleason Grade Group": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Gleason Grade Tertiary": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Gleason Patterns Percent": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Greatest Tumor Dimension": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "IGCCCG Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INPC Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INPC Histologic Group": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INRG Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INSS Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "International Prognostic Index": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "IRS Group": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "IRS Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "ISS Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Lymph Node Involved Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Margin Distance": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Margins Involved Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Medulloblastoma Molecular Classification": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Micropapillary Features": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Mitosis Karyorrhexis Index": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" 
+ } + ], + "Non Nodal Regional Disease": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Non Nodal Tumor Deposits": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Ovarian Specimen Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Ovarian Surface Involvement": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Pregnant at Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Primary Gleason Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Secondary Gleason Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Supratentorial Localization": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Depth": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "WHO CNS Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "WHO NTE Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Additional Topography": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Progression": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Progression Free": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Extent of Tumor Resection": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Mode of Cancer Detection": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "NCI Atlas Cancer Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + }, + { + "fhir_map": "Organization", + "focus": null + } + ], + "Other Biopsy Resection Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + }, + { + "fhir_map": "Organization", + "focus": null + } + ], + "Progression or Recurrence Type": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Satellite Metastasis Present Indicator": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Sentinel Lymph Node Count": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Sentinel Node Positive Assessment Count": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Topography Code": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Extranodal Extension Indicator": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Yes - Anaplasia Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Yes - Progression or Recurrence": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Yes - Vascular Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Ethnicity": [ + { + "fhir_map": "Patient.extension.valueString", + "focus": null + } + ], + "Gender": [ + { + "fhir_map": "Patient.extension.valueCode", + "focus": null + } + ], + "Race": [ + { + "fhir_map": 
"Patient.extension.valueString", + "focus": null + } + ], + "Vital Status": [ + { + "fhir_map": "Patient.deceasedBoolean", + "focus": null + } + ], + "Days to Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Country of Residence": [ + { + "fhir_map": "Patient.address.country", + "focus": null + } + ], + "Age Is Obfuscated": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Year Of Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Occupation Duration Years": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Premature At Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Weeks Gestation at Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Cause of Death": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Cause of Death Source": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Days to Death": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Dead": [ + { + "fhir_map": "Patient.deceasedBoolean", + "focus": null + } + ], + "Year of Death": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Treatment or Therapy": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Treatment Type": [ + { + "fhir_map": "MedicationAdministration.category", + "focus": null + } + ], + "Treatment Effect": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Outcome": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Days to Treatment End": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Anatomic Site": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Days to Treatment Start": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Initial Disease Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Regimen or Line of Therapy": [ + { + "fhir_map": "MedicationAdministration.category", + "focus": null + } + ], + "Therapeutic Agents": [ + { + "fhir_map": "MedicationAdministration.medication", + "focus": null + } + ], + "Treatment Intent Type": [ + { + "fhir_map": "MedicationAdministration.category", + "focus": null + } + ], + "Chemo Concurrent to Radiation": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Number of Cycles": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Reason Treatment Ended": [ + { + "fhir_map": "MedicationAdministration.statusReason", + "focus": null + } + ], + "Treatment Arm": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Dose": [ + { + "fhir_map": "MedicationAdministration.dosage", + "focus": null + } + ], + "Treatment Dose Units": [ + { + "fhir_map": "MedicationAdministration.dosage", + "focus": null + } + ], + "Treatment Effect Indicator": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Frequency": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Concomitant Medication Received Type": [ + { + "fhir_map": "Observation.component", + "focus": 
"MedicationAdministration" + } + ], + "Immunosuppression": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Prior Sites of Radiation": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ] +} diff --git a/resources/htan_resources/files.json b/resources/htan_resources/files.json new file mode 100644 index 0000000..8e31dec --- /dev/null +++ b/resources/htan_resources/files.json @@ -0,0 +1,450 @@ +{ + "Filename": [ + { + "fhir_map": "DocumentReference.name", + "focus": null + }, + { + "fhir_map": "DocumentReference.content.attachment.title", + "focus": null + } + ], + "Atlas Name": [ + { + "fhir_map": "ResearchStudy.name", + "focus": null + } + ], + "Biospecimen": [ + { + "fhir_map": "DocumentReference.basedOn", + "focus": null + } + ], + "Assay": [ + { + "fhir_map": "DocumentReference.category", + "focus": null + } + ], + "Level": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Organ": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Treatment": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Data Access": [ + { + "fhir_map": "DocumentReference.securityLabel", + "focus": "DocumentReference" + } + ], + "File Format": [ + { + "fhir_map": "DocumentReference.content.contentType", + "focus": "DocumentReference" + } + ], + "HTAN Participant ID": [ + { + "fhir_map": "Patient.identifier", + "use": "official", + "focus": null + } + ], + "HTAN Parent Biospecimen ID": [ + { + "fhir_map": "Specimen.parent", + "focus": null + } + ], + "HTAN Data File ID": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "official", + "focus": null + } + ], + "Channel Metadata Filename": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Imaging Assay Type": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Protocol Link": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Softwareand Version": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Microscope": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Objective": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Nominal Magnification": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Lens NA": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Working Distance": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Working Distance Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Immersion": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Pyramid": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Zstack": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Tseries": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Passed QC": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Comment": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FO Vnumber": [ + { + 
"fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVX": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVX Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVY": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVY Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Frame Averaging": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Image ID": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Dimension Order": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size X": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size X Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Y": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Y Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Z": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Z Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Pixels Big Endian": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Plane Count": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size C": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size T": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size X": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size Y": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size Z": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Pixel Type": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "MERFISH Positions File": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "MERFISH Codebook File": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Synapse Id": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "secondary", + "focus": null + } + ], + "Atlasid": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Data File ID": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "official", + "focus": null + } + ], + "Participant ID": [ + { + "fhir_map": "Patient.identifier", + "use": "official", + "focus": null + } + ], + "Parent Biospecimen ID": [ + { + "fhir_map": "Specimen.parent", + "focus": null + } + ], + "Publication Ids": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Is Raw Sequencing": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Release Version": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "HTAN Parent Data File ID": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "secondary", + "focus": null + } + ], + "Imaging Segmentation Data Type": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + 
"Parameterfile": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Commit SHA": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Imaging Object Class": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Numberof Objects": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Parent Data File ID": [ + { + "fhir_map": "DocumentReference.relatesTo.target", + "focus": null + } + ], + "HTAN Parent Channel Metadata ID": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Numberof Features": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Imaging Summary Statistic": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Metadata": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "View": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ] +} \ No newline at end of file diff --git a/setup.py b/setup.py index 9bf1a93..9190e4c 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -__version__ = '2.0.3' +__version__ = '2.1.3' setup( name='fhirizer', From c12dabf6e2ff3e26377610a415261f0efb8b1cb3 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 1 Oct 2024 09:53:02 -0700 Subject: [PATCH 02/24] updated readme --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d4eec77..b6882a1 100644 --- a/README.md +++ b/README.md @@ -154,15 +154,15 @@ fhirizer/ | | |-- data/ | | └── META/ | └── HTAN/ -| | └── OHSU/ -| | └── Breast_NOS/ -| | |-- raw/ -| | | |-- files/ -| | | |-- biospecimens/ -| | | └── cases/ -| | └── META/ -| | -| | +| └── OHSU/ +| └── Breast_NOS/ +| |-- raw/ +| | |-- files/ +| | |-- biospecimens/ +| | └── cases/ +| └── META/ +| +| |--README.md └── setup.py ``` From ba5e80cac1931638253eeb69bfc24cd9659eb7dd Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 08:28:26 -0700 Subject: [PATCH 03/24] initial htan class and patient transformer --- fhirizer/htan2fhir.py | 281 +++++++++++++++++++++++++++++++----------- fhirizer/utils.py | 8 +- 2 files changed, 212 insertions(+), 77 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 19ad40a..e1a376a 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -11,11 +11,13 @@ from pathlib import Path import importlib.resources from uuid import uuid3, NAMESPACE_DNS +from typing import Any from fhir.resources.reference import Reference from fhir.resources.identifier import Identifier from fhir.resources.codeableconcept import CodeableConcept from fhir.resources.patient import Patient +from fhir.resources.address import Address from fhir.resources.researchstudy import ResearchStudy from fhir.resources.researchsubject import ResearchSubject from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection @@ -30,77 +32,210 @@ # File data on synapse after authentication # https://github.com/Sage-Bionetworks/synapsePythonClient?tab=readme-ov-file#store-a-file-to-synapse -project_id = "OHSU_Breast_NOS" -project_path = "./projects/HTAN/OHSU/Breast_NOS" -SYSTEM_HTAN = 'https://data.humantumoratlas.org' -NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SYSTEM_HTAN) -verbose = True - -cases_mapping = utils._read_json(str(Path(importlib.resources.files( - 'fhirizer').parent / 
'resources' / 'htan_resources' / 'cases.json'))) - -biospecimens_mapping = utils._read_json(str(Path(importlib.resources.files( - 'fhirizer').parent / 'resources' / 'htan_resources' / 'biospecimens.json'))) - -files_mapping = utils._read_json(str(Path(importlib.resources.files( - 'fhirizer').parent / 'resources' / 'htan_resources' / 'files.json'))) - -# https://jen-dfci.github.io/htan_missing_manual/data_model/overview/ - -# cases_mappings -# https://data.humantumoratlas.org/standard/clinical -# cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter -# 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP -# 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient -cases_path = "".join([project_path, "/raw/cases/table_data.tsv"]) -cases = pd.read_csv(cases_path, sep="\t") - -# identifiers of the cases matrix/df -patient_identifier_field = "HTAN Participant ID" - - -def get_htan_field(match, field_maps, map_info): - for field, mappings in field_maps.items(): - assert isinstance(mappings, list), f"HTAN resource mappings is not a list: {type(mappings)}, {mappings}" - for entry_map in mappings: - if entry_map[map_info] and entry_map[map_info] == match: - yield field - break - - -components_fields = [] -for key in get_htan_field(match='Condition', field_maps=cases_mapping, map_info='focus'): - components_fields.append(key) - if verbose: - print(f"Observation focus -> condition - filed': {key}") - -observation_component_df = cases[[patient_identifier_field] + components_fields] - - -for key in get_htan_field(match='Observation.component', field_maps=cases_mapping, map_info='fhir_map'): - if verbose: - print(f"field name mapped to Observation.component': {key}") - -# _component = utils.get_component(key=field, value=_component_value, component_type=utils.get_data_types(type(_component_value)), system=SYSTEM_HTAN) - -# format for onsetAge -# "onsetAge": { -# "value": 23194, -# "unit": "days", -# "system": "http://unitsofmeasure.org", -# "code": "d" -# } - -# biospecimens_mapping -# biospecimens to Specimen / Observation -> Specimen -# 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. 
# check for participant id for type of reference -# 'Biospecimen Type': #NOTE: Doesn't seem informative -biospecimens_path = "".join([project_path, "/raw/biospecimens/table_data.tsv"]) -biospecimens = pd.read_csv(biospecimens_path, sep="\t") -biospecimen_identifier_field = "HTAN Biospecimen ID" - -# files_mapping -# files to DocumentReference / Attachment / Observation -> DocumentReference -files_metadata = pd.read_csv("".join([project_path, "/raw/files/table_data.tsv"]), sep="\t") -files_drs_uri = pd.read_csv("".join([project_path, "/raw/files/cds_manifest.csv"])) +class HTANTransformer: + def __init__(self, subprogram_name: str, project_id: str, verbose: bool): + self.mint_id = utils.mint_id + self._mint_id = utils._mint_id + self.get_data_type = utils.get_data_types + self.get_component = utils.get_component + self.fhir_ndjson = utils.fhir_ndjson + self.subprogram_name = subprogram_name + self.project_id = project_id + self.verbose = verbose + self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' + self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) + self.project_id = project_id + self.read_json = utils._read_json + + self.project_path = str( + Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name / project_id)) + assert Path(self.project_path).is_dir(), f"Path {self.project_path} is not a valid directory path." + + self.cases_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'htan_resources' / 'cases.json')) + assert Path(self.cases_path).is_file(), f"Path {self.cases_path} does not exist." + + self.biospecimens_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'htan_resources' / 'biospecimens.json')) + assert Path(self.biospecimens_path).is_file(), f"Path {self.biospecimens_path} does not exist." + + self.files_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'htan_resources' / 'files.json')) + assert Path(self.files_path).is_file(), f"Path {self.files_path} does not exist." + + self.cases_mappings = self.get_cases_mappings + + # cases_mappings + # https://data.humantumoratlas.org/standard/clinical + # cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter + # 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP + # 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient + self.cases_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/cases/table_data.tsv") + assert self.cases_table_data_path.is_file(), f"Path {self.cases_table_data_path} is not a valid file path." + self.cases = self.get_dataframe(self.cases_table_data_path, sep="\t") + self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df + + self.biospecimen_mappings = self.get_biospecimen_mappings + + # biospecimens_mapping + # biospecimens to Specimen / Observation -> Specimen + # 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. # check for participant id for type of reference + # 'Biospecimen Type': #NOTE: Doesn't seem informative + self.biospecimens_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath( + "./raw/biospecimens/table_data.tsv") + assert self.biospecimens_table_data_path.is_file(), f"Path {self.biospecimens_table_data_path} is not a valid file path." 
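        # Illustrative sketch (same query pattern as the docstring examples further
        # down in this class, not executed here): the biospecimen mapping can be
        # interrogated like the cases mapping, e.g. to list every biospecimen field
        # that should become an Observation.component with a Specimen focus:
        #
        #   for _field, _map, _use, _focus in self.get_fields_by_fhir_map(
        #           self.get_biospecimen_mappings(), "Observation.component"):
        #       if _focus == "Specimen":
        #           print(_field)
        #
        # Fields such as "Preservation Method" and "Tissue Biospecimen Type" map
        # directly to Specimen.processing.method / Specimen.type instead
        # (see resources/htan_resources/biospecimens.json).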
+ self.biospecimens = self.get_dataframe(self.biospecimens_table_data_path, sep="\t") + self.biospecimen_identifier_field = "HTAN Biospecimen ID" + + self.files_mappings = self.get_files_mappings + + # files_mapping + # files to DocumentReference / Attachment / Observation -> DocumentReference + + self.files_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/table_data.tsv") + self.files_drs_uri_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/cds_manifest.csv") + assert self.files_table_data_path.is_file(), f"Path {self.files_table_data_path} is not a valid file path." + assert self.files_drs_uri_path.is_file(), f"Path {self.files_drs_uri_path} is not a valid file path." + + self.files = self.get_dataframe(self.files_table_data_path, sep="\t") + self.files_drs_uri = pd.read_csv(self.files_drs_uri_path, sep=",") + + self.patient_demographics = self.get_patient_demographics() + + def get_cases_mappings(self) -> dict: + """HTAN cases FHIR mapping""" + return self.read_json(self.cases_path) + + def get_biospecimen_mappings(self) -> dict: + """HTAN biospesimens FHIR mapping""" + return self.read_json(self.biospecimens_path) + + def get_files_mappings(self) -> dict: + """HTAN files FHIR mapping""" + return self.read_json(self.files_path) + + @staticmethod + def get_dataframe(_path, sep) -> pd.DataFrame: + """Returns a Pandas DataFrame with lower-case and inflection.underscore columns for standard UI input""" + _data = pd.read_csv(_path, sep=sep) + # _data.columns = _data.columns.to_series().apply(lambda x: inflection.underscore(inflection.parameterize(x))) + return _data + + def get_patient_demographics(self) -> pd.DataFrame: + """HTAN cases table_data.tsv data with Patient FHIR demographics mappings column/field match""" + field_list = [] + for field in self.get_htan_mapping(match='Patient', field_maps=self.cases_mappings(), map_info='fhir_map', fetch='field'): + field_list.append(field) + if self.verbose: + print(f"field name': {field}") + + patient_demographics = self.cases[field_list] + return patient_demographics + + @staticmethod + def get_htan_mapping(match, field_maps, map_info, fetch): + """Yields FHIR HTAN maps from HTAN field or FHIR mapping string""" + for field, mappings in field_maps.items(): + assert isinstance(mappings, list), f"HTAN resource mappings is not a list: {type(mappings)}, {mappings}" + for entry_map in mappings: + if entry_map[map_info] and match in entry_map[map_info]: + if fetch == "field": + yield field + break + elif fetch == "mapping": + yield entry_map + break + + @staticmethod + def get_fields_by_fhir_map(mapping_data, fhir_mapping=None): + """ + Yields the field(s) associated with a specific HTAN FHIR map or all HTAN FHIR maps + + Return: Yields the field, FHIR map, identifier use, and focus. 
+ example: + for field, fhir_map, use, focus in get_fields_by_fhir_map(cases_mapping, "Observation.component"): + print(f"Field: {field}, FHIR Map: {fhir_map}, Identifier use: {use}, Focus: {focus}") + """ + for _field, mappings in mapping_data.items(): + for mapping in mappings: + _current_fhir_map = mapping["fhir_map"] + _focus = mapping.get("focus", None) + _use = mapping.get("use", None) + + if fhir_mapping is None or _current_fhir_map == fhir_mapping: + yield _field, _current_fhir_map, _use, _focus + + @staticmethod + def get_fhir_maps_by_field(mapping_data, field_name=None): + """ + Yields the FHIR map(s) associated with a specific HTAN field or all HTAN FHIR maps + + Return: Yields the field, FHIR map, identifier use, and focus. + example use: + for field, fhir_map, use, focus in get_fhir_maps_by_field(cases_mapping, "Year of Diagnosis"): + print(f"Field: {field}, FHIR Map: {fhir_map}, Identifier use: {use}, Focus: {focus}") + """ + for _field, mappings in mapping_data.items(): + if field_name is None or _field == field_name: + for mapping in mappings: + _fhir_map = mapping["fhir_map"] + _focus = mapping.get("focus", None) + _use = mapping.get("use", None) + yield _field, _fhir_map, _use, _focus + + +class PatientTransformer(HTANTransformer): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(**kwargs) + self.cases_mapping = self.cases_mappings + self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + + def create_patient(self, _row: pd.Series) -> Patient: + """Transform HTAN case demographics to FHIR Patient""" + use = None + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), "Patient.identifier"): + use = _use + assert use, f"Patient.identifier use is not defined in ./resources/HTAN/cases.json mappings." + + patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": _row['HTAN Participant ID'], "use": use}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + deceasedBoolean_fields = [] + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), "Patient.deceasedBoolean"): + deceasedBoolean_fields.append(_field) + assert deceasedBoolean_fields, f"Patient.deceasedBoolean has no fields defined in ./resources/HTAN/cases.json mappings." 
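        # Intent of the lookup below: collapse the deceased-related columns
        # ("Vital Status", "Dead") into FHIR Patient.deceasedBoolean. A reported
        # "Dead" value becomes True, any other reported status becomes False, and a
        # row with nothing reported stays None. Note this relies on .any() over the
        # object-dtype array returned by .unique(), which or-reduces to a truthy
        # element (e.g. the string "Dead") rather than a strict bool.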
+ + + vital_status = _row[deceasedBoolean_fields].dropna().unique().any() + deceasedBoolean = {"Dead": True}.get(vital_status, False if vital_status else None) + + # TODO: us-core-ethnicity and race resource + ethnicity = _row.get("Ethnicity") + race = _row.get("Race") + + address_country = _row.get("Country of Residence") + address = Address(**{"country": address_country}) + + return Patient(**{"id": patient_id, + "identifier": [patient_identifier], + "deceasedBoolean": deceasedBoolean, + "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "valueString": ethnicity}, + {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "valueString": race} + ], + "address": [address]}) + + +transformer = HTANTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) +patient_transformer = PatientTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) +patient_demographics_df = transformer.patient_demographics + +patients = [] +for index, row in patient_demographics_df.iterrows(): + patient = patient_transformer.create_patient(_row=row) + if patient: + patients.append(orjson.loads(patient.json())) + print(f"HTAN FHIR Patient: {patient.json()}") \ No newline at end of file diff --git a/fhirizer/utils.py b/fhirizer/utils.py index 71fcad0..92f1744 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -1075,15 +1075,15 @@ def ncit2mondo(path): def get_data_types(data_type): - if data_type in ['int64', 'int32', 'int16']: + if data_type in ['int64', 'int32', 'int16', 'int']: return 'int' - elif data_type in ['float64', 'float32', 'float16']: + elif data_type in ['float64', 'float32', 'float16', 'float']: return 'float' - elif data_type in ['string']: + elif data_type in ['str', 'string']: return 'string' elif data_type == 'bool': return 'bool' - elif data_type in ['datetime64[ns]', 'timedelta64[ns]', 'period']: + elif data_type in ['datetime64[ns]', 'timedelta64[ns]', 'period', 'datetime', 'date']: return 'dateTime' else: print(f"New or Null Data type: {data_type}.") From ccf849e924d89ba11b4aefc5719091188c3b7b64 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 09:37:31 -0700 Subject: [PATCH 04/24] patient observation --- fhirizer/htan2fhir.py | 63 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index e1a376a..d209512 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -190,6 +190,9 @@ def __init__(self, *args: Any, **kwargs: Any): super().__init__(**kwargs) self.cases_mapping = self.cases_mappings self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + self.get_data_types = utils.get_data_types + self.get_component = self.get_component + self.get_fields_by_fhir_map = self.get_fields_by_fhir_map def create_patient(self, _row: pd.Series) -> Patient: """Transform HTAN case demographics to FHIR Patient""" @@ -228,14 +231,68 @@ def create_patient(self, _row: pd.Series) -> Patient: ], "address": [address]}) + def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: + patient_observation_fields = [] + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), "Observation.component"): + if focus == "Patient": + patient_observation_fields.append(field) + + if patient_observation_fields: + _obervation_row = _row[patient_observation_fields] + + components = [] + for key, value in _obervation_row.to_dict().items(): + if key != 'HTAN Participant 
ID': + if isinstance(value, float) and not pd.isna(value) and ("Year" in key or "Day" in key or "year" in key or "day" in key): + value = int(value) + _component = self.get_component(key=key, value=value, component_type=self.get_data_types(type(value).__name__), system=self.SYSTEM_HTAN) + components.append(_component) + + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.id}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + return Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "category": [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "exam", + "display": "exam" + } + ], + "text": "Exam" + } + ], + "code": { + "coding": [ + { + "system": "http://loinc.org", + "code": "52460-3", # TODO: may need to change to be more specific + "display": "patient information" + } + ], + "text": "Patient Information" + }, + "focus": [Reference(**{"reference": f"Patient/{patient.id}"})], + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "component": components}) + transformer = HTANTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) patient_transformer = PatientTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) patient_demographics_df = transformer.patient_demographics +cases = transformer.cases patients = [] -for index, row in patient_demographics_df.iterrows(): - patient = patient_transformer.create_patient(_row=row) +for index, row in cases.iterrows(): + patient_row = cases.iloc[index][patient_demographics_df.columns] + patient = patient_transformer.create_patient(_row=patient_row) + patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) if patient: patients.append(orjson.loads(patient.json())) - print(f"HTAN FHIR Patient: {patient.json()}") \ No newline at end of file + print(f"HTAN FHIR Patient: {patient.json()}") + print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") From af3d61d0ccb5e738d5a5e2f655033cbf8d8607d9 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 13:33:10 -0700 Subject: [PATCH 05/24] researchstudy and researchsubject identifiers --- fhirizer/entity2fhir.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fhirizer/entity2fhir.py b/fhirizer/entity2fhir.py index 3ce22c1..a97465e 100644 --- a/fhirizer/entity2fhir.py +++ b/fhirizer/entity2fhir.py @@ -84,7 +84,7 @@ def assign_fhir_for_project(project, disease_types=disease_types): pr_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "program_id"]), "value": project['ResearchStudy']['ResearchStudy.id']}) pl.append(pr_ident) - + rs.identifier = [pr_ident] rs.id = utils.mint_id( identifier=pr_ident, resource_type="ResearchStudy", @@ -94,6 +94,7 @@ def assign_fhir_for_project(project, disease_types=disease_types): else: p_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "project_id"]), "value": project['ResearchStudy.id']}) + rs.identifier = [p_ident] rs.id = utils.mint_id( identifier=p_ident, resource_type="ResearchStudy", @@ -157,6 +158,7 @@ def assign_fhir_for_project(project, disease_types=disease_types): ref = Reference(**{"reference": "/".join(["ResearchStudy", rs_parent.id])}) rs.partOf = [ref] + # condition -- subject --> patient <--subject-- researchsubject -- study --> researchstudy -- partOf 
--> researchstudy return {'ResearchStudy': rs.json(), "ResearchStudy.partOf": rs_parent.json(), 'ResearchStudy_obj': rs, @@ -394,6 +396,7 @@ def assign_fhir_for_case(case, disease_types=disease_types, primary_sites=primar research_subject.status = "active" research_subject.study = study_ref research_subject.subject = subject_ref + research_subject.identifier = [patient_id_identifier] research_subject.id = utils.mint_id( identifier=patient_id_identifier, resource_type="ResearchSubject", @@ -1890,6 +1893,7 @@ def assign_fhir_for_file(file): for case in file['cases']: patient_id_identifier = Identifier.construct() patient_id_identifier.value = case['Patient.id'] + patient_id_identifier.use = "official" patient_id_identifier.system = "".join(["https://gdc.cancer.gov/", "case_id"]) patient_id = utils.mint_id(identifier=patient_id_identifier, resource_type="Patient", project_id=project_id, From 4daa0693f9c458c064b6900a9a86b2c254fa6f08 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 14:01:53 -0700 Subject: [PATCH 06/24] researchstudy - researchsubject - patient for list a of htan atlas names --- README.md | 13 ++--- fhirizer/htan2fhir.py | 122 +++++++++++++++++++++++++++++++++--------- 2 files changed, 104 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index b6882a1..3968812 100644 --- a/README.md +++ b/README.md @@ -155,12 +155,13 @@ fhirizer/ | | └── META/ | └── HTAN/ | └── OHSU/ -| └── Breast_NOS/ -| |-- raw/ -| | |-- files/ -| | |-- biospecimens/ -| | └── cases/ -| └── META/ +| |-- raw/ +| | |-- files/ +| | | |-- table_data.tsv +| | | └── cds_manifest.csv +| | |-- biospecimens/table_data.tsv +| | └── cases/table_data.tsv +| └── META/ | | |--README.md diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index d209512..2d68c9c 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -34,22 +34,24 @@ class HTANTransformer: - def __init__(self, subprogram_name: str, project_id: str, verbose: bool): + def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.mint_id = utils.mint_id self._mint_id = utils._mint_id self.get_data_type = utils.get_data_types self.get_component = utils.get_component self.fhir_ndjson = utils.fhir_ndjson self.subprogram_name = subprogram_name - self.project_id = project_id + self.project_id = subprogram_name # incase there will be more granular project/program relations + assert Path(out_dir).is_dir(), f"Path to out_dir {out_dir} is not a directory." + self.out_dir = out_dir self.verbose = verbose self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) - self.project_id = project_id self.read_json = utils._read_json + self.fhir_ndjson = utils.fhir_ndjson self.project_path = str( - Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name / project_id)) + Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name)) assert Path(self.project_path).is_dir(), f"Path {self.project_path} is not a valid directory path." 
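# A minimal sketch of the deterministic id minting used throughout this patch:
# utils.mint_id (defined in fhirizer/utils.py, not shown here) derives a stable
# UUID from an Identifier, the resource type, and the project id inside the
# HTAN namespace, so re-running a transform reproduces the same FHIR ids.
# The helper below only illustrates the idea and is an assumption, not the
# actual implementation.
from uuid import NAMESPACE_DNS, uuid3, uuid5

SKETCH_SYSTEM_HTAN = 'https://data.humantumoratlas.org'
SKETCH_NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SKETCH_SYSTEM_HTAN)

def mint_id_sketch(identifier_value: str, resource_type: str, project_id: str) -> str:
    """Same inputs always yield the same resource id."""
    return str(uuid5(SKETCH_NAMESPACE_HTAN, f"{project_id}/{resource_type}/{identifier_value}"))

# e.g. mint_id_sketch("HTA1_982", "Patient", "OHSU") returns the same UUID on every run.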
self.cases_path = str( @@ -71,7 +73,7 @@ def __init__(self, subprogram_name: str, project_id: str, verbose: bool): # cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter # 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP # 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient - self.cases_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/cases/table_data.tsv") + self.cases_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/cases/table_data.tsv") assert self.cases_table_data_path.is_file(), f"Path {self.cases_table_data_path} is not a valid file path." self.cases = self.get_dataframe(self.cases_table_data_path, sep="\t") self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df @@ -82,7 +84,7 @@ def __init__(self, subprogram_name: str, project_id: str, verbose: bool): # biospecimens to Specimen / Observation -> Specimen # 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. # check for participant id for type of reference # 'Biospecimen Type': #NOTE: Doesn't seem informative - self.biospecimens_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath( + self.biospecimens_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( "./raw/biospecimens/table_data.tsv") assert self.biospecimens_table_data_path.is_file(), f"Path {self.biospecimens_table_data_path} is not a valid file path." self.biospecimens = self.get_dataframe(self.biospecimens_table_data_path, sep="\t") @@ -93,8 +95,8 @@ def __init__(self, subprogram_name: str, project_id: str, verbose: bool): # files_mapping # files to DocumentReference / Attachment / Observation -> DocumentReference - self.files_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/table_data.tsv") - self.files_drs_uri_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/cds_manifest.csv") + self.files_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/table_data.tsv") + self.files_drs_uri_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/cds_manifest.csv") assert self.files_table_data_path.is_file(), f"Path {self.files_table_data_path} is not a valid file path." assert self.files_drs_uri_path.is_file(), f"Path {self.files_drs_uri_path} is not a valid file path." @@ -183,6 +185,29 @@ def get_fhir_maps_by_field(mapping_data, field_name=None): _focus = mapping.get("focus", None) _use = mapping.get("use", None) yield _field, _fhir_map, _use, _focus + @staticmethod + def decipher_htan_id(_id) -> dict: + """ + ::= _integer + ::= _integer + wild-card string ex. 
'0000' is used for the same file derived from multiple participants + substring 'EXT' is used for external participants + """ + deciphered_id = {} + _id_substrings = _id.split("_") + participant_id = "_".join([_id_substrings[0],_id_substrings[1]]) + if 'EXT' not in _id_substrings[1] or '0000' not in _id_substrings[1]: + deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} + else: + participant_id = "_".join([_id_substrings[0], _id_substrings[1], _id_substrings[2]]) + deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} + return deciphered_id + + def write_ndjson(self, entities): + resource_type = entities[0].resource_type + entities = [orjson.loads(entity.json()) for entity in entities] + entities = list({v['id']: v for v in entities}.values()) + utils.fhir_ndjson(entities, "".join([self.out_dir, "/", resource_type, ".ndjson"])) class PatientTransformer(HTANTransformer): @@ -219,7 +244,7 @@ def create_patient(self, _row: pd.Series) -> Patient: race = _row.get("Race") address_country = _row.get("Country of Residence") - address = Address(**{"country": address_country}) + address = [Address(**{"country": address_country})] if not pd.isna(address_country) else [] return Patient(**{"id": patient_id, "identifier": [patient_identifier], @@ -229,7 +254,7 @@ def create_patient(self, _row: pd.Series) -> Patient: {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "valueString": race} ], - "address": [address]}) + "address": address}) def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: patient_observation_fields = [] @@ -281,18 +306,65 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "component": components}) - -transformer = HTANTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) -patient_transformer = PatientTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) -patient_demographics_df = transformer.patient_demographics -cases = transformer.cases - -patients = [] -for index, row in cases.iterrows(): - patient_row = cases.iloc[index][patient_demographics_df.columns] - patient = patient_transformer.create_patient(_row=patient_row) - patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) - if patient: - patients.append(orjson.loads(patient.json())) - print(f"HTAN FHIR Patient: {patient.json()}") - print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: + study_field = None + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), "ResearchStudy.name"): + study_field = field + study_name = _row.get(study_field) + researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": study_name}) + researchstudy_id = self.mint_id(identifier=researchstudy_identifier, resource_type="ResearchStudy", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + return ResearchStudy(**{"id": researchstudy_id, + "identifier": [researchstudy_identifier], + "name": study_name, + "status": "open"}) # TODO: add "condition" snomed id + + def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: + researchsubject_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.identifier[0].value}) + researchsubject_id = 
self.mint_id(identifier=researchsubject_identifier, resource_type="ResearchSubject", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + return ResearchSubject(**{"id": researchsubject_id, + "identifier": [researchsubject_identifier], + "status": "active", + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "study": Reference(**{"reference": f"ResearchStudy/{study.id}"})}) + + +atlas_name = ["OHSU", "DFCI", "WUSTL"] +for name in atlas_name: + + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + + patient_demographics_df = transformer.patient_demographics + cases = transformer.cases + + patients = [] + research_studies = [] + research_subjects = [] + for index, row in cases.iterrows(): + + research_study = patient_transformer.create_researchstudy(_row=row) + + if research_study: + research_studies.append(research_study) + + patient_row = cases.iloc[index][patient_demographics_df.columns] + patient = patient_transformer.create_patient(_row=patient_row) + patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) + if patient: + patients.append(patient) + print(f"HTAN FHIR Patient: {patient.json()}") + print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + + research_subject = patient_transformer.create_researchsubject(patient, research_study) + if research_subject: + research_subjects.append(research_subject) + + transformer.write_ndjson(research_subjects) + transformer.write_ndjson(research_studies) + transformer.write_ndjson(patients) + + # participant ids from specimen identifiers + # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) + # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) \ No newline at end of file From 8b276b650b4730b0fd31345573fadd5752f45495 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 3 Oct 2024 06:32:56 -0700 Subject: [PATCH 07/24] transform all available HTAN atlases --- fhirizer/htan2fhir.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 2d68c9c..8b05501 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -329,8 +329,9 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "study": Reference(**{"reference": f"ResearchStudy/{study.id}"})}) - -atlas_name = ["OHSU", "DFCI", "WUSTL"] +# 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) +# 12/14 total Atlas +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) From 122a220d81c715f9795066b6d2de201cef3d03a8 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 3 Oct 2024 09:52:17 -0700 Subject: [PATCH 08/24] initial encounter and condition --- fhirizer/htan2fhir.py | 201 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 178 insertions(+), 23 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 8b05501..71c8ead 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -1,5 +1,7 @@ import uuid import json + +import numpy as np import orjson import copy import glob @@ -11,24 +13,30 
@@ from pathlib import Path import importlib.resources from uuid import uuid3, NAMESPACE_DNS -from typing import Any +from typing import Any, List, Optional +from datetime import datetime from fhir.resources.reference import Reference from fhir.resources.identifier import Identifier -from fhir.resources.codeableconcept import CodeableConcept from fhir.resources.patient import Patient from fhir.resources.address import Address from fhir.resources.researchstudy import ResearchStudy from fhir.resources.researchsubject import ResearchSubject +from fhir.resources.observation import Observation +from fhir.resources.encounter import Encounter +from fhir.resources.codeableconcept import CodeableConcept +from fhir.resources.age import Age +from fhir.resources.procedure import Procedure +from fhir.resources.bodystructure import BodyStructure, BodyStructureIncludedStructure from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection from fhir.resources.condition import Condition, ConditionStage from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ DocumentReferenceContentProfile from fhir.resources.attachment import Attachment -from fhir.resources.observation import Observation from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication + # File data on synapse after authentication # https://github.com/Sage-Bionetworks/synapsePythonClient?tab=readme-ov-file#store-a-file-to-synapse @@ -41,11 +49,13 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.get_component = utils.get_component self.fhir_ndjson = utils.fhir_ndjson self.subprogram_name = subprogram_name - self.project_id = subprogram_name # incase there will be more granular project/program relations + self.project_id = subprogram_name # incase there will be more granular project/program relations assert Path(out_dir).is_dir(), f"Path to out_dir {out_dir} is not a directory." self.out_dir = out_dir self.verbose = verbose self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' + self.SYSTEM_SNOME = 'http://snomed.info/sct' + self.SYSTEM_LOINC = 'http://loinc.org' self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson @@ -73,10 +83,11 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): # cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter # 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP # 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient - self.cases_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/cases/table_data.tsv") + self.cases_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( + "./raw/cases/table_data.tsv") assert self.cases_table_data_path.is_file(), f"Path {self.cases_table_data_path} is not a valid file path." 
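# A rough, self-contained look at the table this path points to, assuming the
# directory layout shown in the README above; the OHSU path is an example only
# and get_dataframe may do more than a plain read:
import pandas as pd

example_cases = pd.read_csv("./projects/HTAN/OHSU/raw/cases/table_data.tsv", sep="\t")
assert "HTAN Participant ID" in example_cases.columns  # the patient identifier field used below
print(example_cases["HTAN Participant ID"].nunique(), "participants in the cases table")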
self.cases = self.get_dataframe(self.cases_table_data_path, sep="\t") - self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df + self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df self.biospecimen_mappings = self.get_biospecimen_mappings @@ -95,8 +106,10 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): # files_mapping # files to DocumentReference / Attachment / Observation -> DocumentReference - self.files_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/table_data.tsv") - self.files_drs_uri_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/cds_manifest.csv") + self.files_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( + "./raw/files/table_data.tsv") + self.files_drs_uri_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( + "./raw/files/cds_manifest.csv") assert self.files_table_data_path.is_file(), f"Path {self.files_table_data_path} is not a valid file path." assert self.files_drs_uri_path.is_file(), f"Path {self.files_drs_uri_path} is not a valid file path." @@ -127,7 +140,8 @@ def get_dataframe(_path, sep) -> pd.DataFrame: def get_patient_demographics(self) -> pd.DataFrame: """HTAN cases table_data.tsv data with Patient FHIR demographics mappings column/field match""" field_list = [] - for field in self.get_htan_mapping(match='Patient', field_maps=self.cases_mappings(), map_info='fhir_map', fetch='field'): + for field in self.get_htan_mapping(match='Patient', field_maps=self.cases_mappings(), map_info='fhir_map', + fetch='field'): field_list.append(field) if self.verbose: print(f"field name': {field}") @@ -185,6 +199,24 @@ def get_fhir_maps_by_field(mapping_data, field_name=None): _focus = mapping.get("focus", None) _use = mapping.get("use", None) yield _field, _fhir_map, _use, _focus + + def get_field_value(self, _row: pd.Series, mapping_type: str, fhir_field: str) -> dict: + mapping_data = None + if mapping_type == "case": + mapping_data = self.cases_mappings() + elif mapping_data == "biospecimen": + mapping_data = self.biospecimen_mappings() + elif mapping_type == "file": + mapping_data = self.files_mappings() + + _this_htan_field = None + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(mapping_data=mapping_data, + fhir_mapping=fhir_field): + _this_htan_field = field + _filed_value = _row.get(_this_htan_field) + + return {"htan_field": _this_htan_field, "htan_field_value": _filed_value} + @staticmethod def decipher_htan_id(_id) -> dict: """ @@ -195,7 +227,7 @@ def decipher_htan_id(_id) -> dict: """ deciphered_id = {} _id_substrings = _id.split("_") - participant_id = "_".join([_id_substrings[0],_id_substrings[1]]) + participant_id = "_".join([_id_substrings[0], _id_substrings[1]]) if 'EXT' not in _id_substrings[1] or '0000' not in _id_substrings[1]: deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} else: @@ -226,16 +258,17 @@ def create_patient(self, _row: pd.Series) -> Patient: use = _use assert use, f"Patient.identifier use is not defined in ./resources/HTAN/cases.json mappings." 
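# A minimal, standalone example of the Patient shape produced by create_patient,
# assuming only what the cases.json mappings define (an official HTAN identifier
# plus an optional deceasedBoolean); the id and identifier values here are
# placeholders, not minted ones:
from fhir.resources.identifier import Identifier
from fhir.resources.patient import Patient

example_identifier = Identifier(**{"system": "https://data.humantumoratlas.org",
                                   "value": "HTA1_982", "use": "official"})
example_patient = Patient(**{"id": "00000000-0000-0000-0000-000000000000",
                             "identifier": [example_identifier],
                             "deceasedBoolean": False})
print(example_patient.json())  # serialized the same way the transform writes patients to META/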
- patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": _row['HTAN Participant ID'], "use": use}) + patient_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Participant ID'], "use": use}) patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + namespace=self.NAMESPACE_HTAN) deceasedBoolean_fields = [] - for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), "Patient.deceasedBoolean"): + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), + "Patient.deceasedBoolean"): deceasedBoolean_fields.append(_field) assert deceasedBoolean_fields, f"Patient.deceasedBoolean has no fields defined in ./resources/HTAN/cases.json mappings." - vital_status = _row[deceasedBoolean_fields].dropna().unique().any() deceasedBoolean = {"Dead": True}.get(vital_status, False if vital_status else None) @@ -258,7 +291,8 @@ def create_patient(self, _row: pd.Series) -> Patient: def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: patient_observation_fields = [] - for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), "Observation.component"): + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), + "Observation.component"): if focus == "Patient": patient_observation_fields.append(field) @@ -268,9 +302,12 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: components = [] for key, value in _obervation_row.to_dict().items(): if key != 'HTAN Participant ID': - if isinstance(value, float) and not pd.isna(value) and ("Year" in key or "Day" in key or "year" in key or "day" in key): + if isinstance(value, float) and not pd.isna(value) and ( + "Year" in key or "Day" in key or "year" in key or "day" in key): value = int(value) - _component = self.get_component(key=key, value=value, component_type=self.get_data_types(type(value).__name__), system=self.SYSTEM_HTAN) + _component = self.get_component(key=key, value=value, + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) components.append(_component) observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.id}) @@ -295,8 +332,8 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: "code": { "coding": [ { - "system": "http://loinc.org", - "code": "52460-3", # TODO: may need to change to be more specific + "system": self.SYSTEM_LOINC, + "code": "52460-3", # TODO: may need to change to be more specific "display": "patient information" } ], @@ -317,10 +354,11 @@ def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: return ResearchStudy(**{"id": researchstudy_id, "identifier": [researchstudy_identifier], "name": study_name, - "status": "open"}) # TODO: add "condition" snomed id + "status": "open"}) # TODO: add "condition" snomed id def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: - researchsubject_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.identifier[0].value}) + researchsubject_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.identifier[0].value}) researchsubject_id = self.mint_id(identifier=researchsubject_identifier, resource_type="ResearchSubject", project_id=self.project_id, 
namespace=self.NAMESPACE_HTAN) return ResearchSubject(**{"id": researchsubject_id, @@ -329,13 +367,115 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "study": Reference(**{"reference": f"ResearchStudy/{study.id}"})}) + def create_encounter(self, _row: pd.Series, patient: Patient, condition: Optional[Condition], + procedure: Optional[Procedure]) -> Encounter: + # identifier string = project / patient / [condition/procedure] - assume parent encounter atm + condition_procedure = "" + if condition: + condition_procedure = condition.id + elif procedure: + condition_procedure = procedure.id + + encounter_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", + "value": "/".join([self.subprogram_name, patient.identifier[0].value])}) + encounter_id = self.mint_id(identifier=encounter_identifier, resource_type="Encounter", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + return Encounter(**{"id": encounter_id, + "identifier": [encounter_identifier], + "status": "completed", + "subject": Reference(**{"reference": f"Patient/{patient.id}"}) + }) + + def create_body_structure(self, _row, patient: Patient) -> BodyStructure: + body_structure_value = _row.get("Tissue or Organ of Origin") + included_structure = [] + if body_structure_value: + included_structure = [BodyStructureIncludedStructure(**{"structure": CodeableConcept(**{"coding": [ + {"code": body_structure_value, "system": self.SYSTEM_HTAN, "display": body_structure_value}]})})] + body_struct_ident = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", "value": body_structure_value}) + return BodyStructure( + **{"id": utils.mint_id(identifier=[patient.identifier[0].value, body_struct_ident], + resource_type="BodyStructure", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN), + "identifier": [body_struct_ident], + "includedStructure": included_structure, + "patient": Reference(**{"reference": f"Patient/{patient.id}"}) + }) + + def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encounter, + body_structure: Optional[BodyStructure]) -> Optional[Condition]: + primary_diagnosis = _row.get("Primary Diagnosis") + if pd.isnull(primary_diagnosis): + return None + + # identifier string = project / patient / primary diagnosis + condition_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": "/".join([self.subprogram_name, patient.id, + primary_diagnosis])}) + condition_id = self.mint_id(identifier=condition_identifier, resource_type="ResearchSubject", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + onset_age = None + primary_diagnosis_age = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.onsetAge") + + primary_diagnosis_age_value = None + if not np.isnan(primary_diagnosis_age["htan_field_value"]): + primary_diagnosis_age_value = int(primary_diagnosis_age["htan_field_value"]) + + if primary_diagnosis_age_value: + onset_age = Age(**{"value": primary_diagnosis_age_value, + "unit": "years", + "system": "http://unitsofmeasure.org", + "code": "a" + }) + + recorded_date_field_value = self.get_field_value(_row=_row, mapping_type="case", + fhir_field="Condition.recordedDate") + recorded_date = None + if not np.isnan(recorded_date_field_value["htan_field_value"]): + recorded_date = datetime(int(recorded_date_field_value["htan_field_value"]), 1, 1) + + body_structure = self.create_body_structure(_row, patient) + 
patient_body_structure_ref = Reference(**{"reference": f"BodyStructure/{body_structure.id}"}) if body_structure.includedStructure else None + + patient_body_site_cc = [] + patient_body_site = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.bodySite")["htan_field_value"] + + if patient_body_site: + patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, + "system": self.SYSTEM_HTAN, + "display": patient_body_site}]})] + + return Condition(**{"id": condition_id, + "identifier": [condition_identifier], + "code": CodeableConcept(**{"coding": [{"code": primary_diagnosis, + "system": self.SYSTEM_HTAN, + "display": primary_diagnosis}]}), + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "clinicalStatus": CodeableConcept(**{"coding": [{"code": "active", + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical" , + "display": "Active"}]}), + "onsetAge": onset_age, + "recordedDate": recorded_date, + "bodySite": patient_body_site_cc, + # "bodyStructure": patient_body_structure_ref, + "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}), + "stage": [], + }) + + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) - patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases @@ -343,6 +483,8 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese patients = [] research_studies = [] research_subjects = [] + conditions = [] + encounters = [] for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) @@ -362,10 +504,23 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese if research_subject: research_subjects.append(research_subject) + encounter = patient_transformer.create_encounter(_row=row, patient=patient, condition=None, + procedure=None) + if encounter: + encounters.append(encounter) + condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, + body_structure=None) + + if condition: + conditions.append(condition) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) + transformer.write_ndjson(encounters) + transformer.write_ndjson(conditions) # participant ids from specimen identifiers # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) - # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) \ No newline at end of file + # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) + From c58f467798a20a246e3f2a0cf3c7bd97e7656bbc Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 07:06:50 -0700 Subject: [PATCH 09/24] condition observations + research study partOf --- fhirizer/htan2fhir.py | 108 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 
71c8ead..c6f923e 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -60,6 +60,14 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson + parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) + parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, resource_type="ResearchStudy", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + self.program_research_study = ResearchStudy(**{"id": parent_researchstudy_id, + "identifier": [parent_researchstudy_identifier], + "name": "HTAN", + "status": "open"}) + self.project_path = str( Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name)) assert Path(self.project_path).is_dir(), f"Path {self.project_path} is not a valid directory path." @@ -118,6 +126,7 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.patient_demographics = self.get_patient_demographics() + def get_cases_mappings(self) -> dict: """HTAN cases FHIR mapping""" return self.read_json(self.cases_path) @@ -308,6 +317,7 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: _component = self.get_component(key=key, value=value, component_type=self.get_data_types(type(value).__name__), system=self.SYSTEM_HTAN) + components.append(_component) observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.id}) @@ -351,10 +361,13 @@ def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": study_name}) researchstudy_id = self.mint_id(identifier=researchstudy_identifier, resource_type="ResearchStudy", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + # TODO: add "condition" snomed id return ResearchStudy(**{"id": researchstudy_id, "identifier": [researchstudy_identifier], "name": study_name, - "status": "open"}) # TODO: add "condition" snomed id + "status": "open", + "partOf": [Reference(**{"reference": f"ResearchStudy/{self.program_research_study.id}"})]}) def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: researchsubject_identifier = Identifier( @@ -377,7 +390,7 @@ def create_encounter(self, _row: pd.Series, patient: Patient, condition: Optiona condition_procedure = procedure.id encounter_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", - "value": "/".join([self.subprogram_name, patient.identifier[0].value])}) + "value": "-".join([self.subprogram_name, patient.identifier[0].value])}) encounter_id = self.mint_id(identifier=encounter_identifier, resource_type="Encounter", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) @@ -414,7 +427,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount # identifier string = project / patient / primary diagnosis condition_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", - "value": "/".join([self.subprogram_name, patient.id, + "value": "-".join([self.subprogram_name, patient.id, primary_diagnosis])}) condition_id = self.mint_id(identifier=condition_identifier, resource_type="ResearchSubject", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) @@ -447,8 +460,8 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount if patient_body_site: 
patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, - "system": self.SYSTEM_HTAN, - "display": patient_body_site}]})] + "system": self.SYSTEM_HTAN, + "display": patient_body_site}]})] return Condition(**{"id": condition_id, "identifier": [condition_identifier], @@ -467,6 +480,78 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) + def create_observation(self, _row: pd.Series, patient: Patient, + specimen: Optional[Specimen], official_focus: str, + focus: List[Reference], components: Optional[List], category: Optional[dict]) -> Observation: + assert focus, f"Observation for patient {patient.id} is missing focus." + + if not category: + category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "exam", + "display": "exam" + } + ], + "text": "Exam" + } + ] + + observation_fields = [] + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), + "Observation.component"): + if _focus == official_focus: + observation_fields.append(_field) + + _obervation_row = _row[observation_fields] if observation_fields else None + + if _obervation_row is not None: + components = [] + for key, value in _obervation_row.to_dict().items(): + if key != 'HTAN Participant ID': + try: + if not pd.isnull(value): + if not isinstance(value, str) and value.is_integer(): + value = int(value) + _component = self.get_component(key=key, value=value, + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) + components.append(_component) + except (ValueError, TypeError): + if self.verbose: + print(f"Components {key}: {value} can't be added to list - value/type error.") + + focus_ids = [r.reference.split("/")[1] for r in focus] + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": "-".join([patient.identifier[0].value] + focus_ids)}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + specimen_ref = None + if specimen: + specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) + # add valueCodeableConcept as needed after creation + return Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "category": category, + "code": { + "coding": [ + { + "system": self.SYSTEM_LOINC, + "code": "75323-6", # TODO: place-holder + "display": "Condition" + } + ], + "text": "Condition" + }, + "focus": focus, + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "component": components, + "specimen": specimen_ref}) + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas @@ -485,20 +570,24 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount research_subjects = [] conditions = [] encounters = [] + observations = [] for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) if research_study: + research_studies.append(transformer.program_research_study) research_studies.append(research_study) patient_row = cases.iloc[index][patient_demographics_df.columns] patient = patient_transformer.create_patient(_row=patient_row) patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) + if patient_obs: + observations.append(patient_obs) if patient: patients.append(patient) - 
print(f"HTAN FHIR Patient: {patient.json()}") - print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + # print(f"HTAN FHIR Patient: {patient.json()}") + # print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") research_subject = patient_transformer.create_researchsubject(patient, research_study) if research_subject: @@ -514,11 +603,16 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount if condition: conditions.append(condition) + condition_observation = patient_transformer.create_observation(_row=row, patient=patient, official_focus="Condition", focus=[Reference(**{"reference": f"Condition/{condition.id}"})], specimen=None, components=None, category=None) + if condition_observation: + observations.append(condition_observation) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) transformer.write_ndjson(encounters) transformer.write_ndjson(conditions) + transformer.write_ndjson(observations) # participant ids from specimen identifiers # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) From 2e5c4f50f78a45fed83ac4ec1d0fc65b2f666075 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 07:52:48 -0700 Subject: [PATCH 10/24] initial specimen --- fhirizer/htan2fhir.py | 107 +++++++++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 17 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index c6f923e..695bde3 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -61,8 +61,9 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.fhir_ndjson = utils.fhir_ndjson parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) - parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, resource_type="ResearchStudy", - project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, + resource_type="ResearchStudy", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) self.program_research_study = ResearchStudy(**{"id": parent_researchstudy_id, "identifier": [parent_researchstudy_identifier], "name": "HTAN", @@ -126,7 +127,6 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.patient_demographics = self.get_patient_demographics() - def get_cases_mappings(self) -> dict: """HTAN cases FHIR mapping""" return self.read_json(self.cases_path) @@ -367,7 +367,8 @@ def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: "identifier": [researchstudy_identifier], "name": study_name, "status": "open", - "partOf": [Reference(**{"reference": f"ResearchStudy/{self.program_research_study.id}"})]}) + "partOf": [ + Reference(**{"reference": f"ResearchStudy/{self.program_research_study.id}"})]}) def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: researchsubject_identifier = Identifier( @@ -453,10 +454,12 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount recorded_date = datetime(int(recorded_date_field_value["htan_field_value"]), 1, 1) body_structure = self.create_body_structure(_row, patient) - patient_body_structure_ref = Reference(**{"reference": f"BodyStructure/{body_structure.id}"}) if body_structure.includedStructure else None + patient_body_structure_ref = Reference( + **{"reference": 
f"BodyStructure/{body_structure.id}"}) if body_structure.includedStructure else None patient_body_site_cc = [] - patient_body_site = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.bodySite")["htan_field_value"] + patient_body_site = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.bodySite")[ + "htan_field_value"] if patient_body_site: patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, @@ -470,7 +473,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "display": primary_diagnosis}]}), "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "clinicalStatus": CodeableConcept(**{"coding": [{"code": "active", - "system": "http://terminology.hl7.org/CodeSystem/condition-clinical" , + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "display": "Active"}]}), "onsetAge": onset_age, "recordedDate": recorded_date, @@ -497,11 +500,11 @@ def create_observation(self, _row: pd.Series, patient: Patient, ], "text": "Exam" } - ] + ] observation_fields = [] for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), - "Observation.component"): + "Observation.component"): if _focus == official_focus: observation_fields.append(_field) @@ -516,8 +519,8 @@ def create_observation(self, _row: pd.Series, patient: Patient, if not isinstance(value, str) and value.is_integer(): value = int(value) _component = self.get_component(key=key, value=value, - component_type=self.get_data_types(type(value).__name__), - system=self.SYSTEM_HTAN) + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) components.append(_component) except (ValueError, TypeError): if self.verbose: @@ -541,7 +544,7 @@ def create_observation(self, _row: pd.Series, patient: Patient, "coding": [ { "system": self.SYSTEM_LOINC, - "code": "75323-6", # TODO: place-holder + "code": "75323-6", # TODO: place-holder "display": "Condition" } ], @@ -552,18 +555,76 @@ def create_observation(self, _row: pd.Series, patient: Patient, "component": components, "specimen": specimen_ref}) + def create_medication_administration(self) -> MedicationAdministration: + return MedicationAdministration(**{}) + + +class SpecimenTransformer(HTANTransformer): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(**kwargs) + self.cases_mapping = self.cases_mappings + self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + self.get_data_types = utils.get_data_types + self.get_component = self.get_component + self.get_fields_by_fhir_map = self.get_fields_by_fhir_map + + def create_specimen(self, _row: pd.Series) -> Specimen: + """Transform HTAN biospecimen to FHIR Specimen""" + + specimen_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Biospecimen ID'], "use": "official"}) + specimen_id = self.mint_id(identifier=specimen_identifier, resource_type="Specimen", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + # participant id from specimen identifier + participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] + assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
+ + patient_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists + + parent_specimen_reference = [] + if not pd.isnull(_row["HTAN Parent ID"]): + parent_specimen_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Biospecimen ID'], "use": "official"}) + parent_specimen_id = self.mint_id(identifier=parent_specimen_identifier, resource_type="Specimen", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + parent_specimen_reference.append(Reference(**{"reference": f"Specimen/{parent_specimen_id}"})) + + specimen_fields = [] + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), + "Specimen"): + specimen_fields.append(_field) + + return Specimen(**{"id": specimen_id, + "identifier": [specimen_identifier], + "type": CodeableConcept(**{"coding": [ + {"code": _row["Biospecimen Type"], "system": self.SYSTEM_HTAN, + "display": _row["Biospecimen Type"]}]}), + "processing": [SpecimenProcessing(**{"method": CodeableConcept(**{"coding": [ + {"code": _row["Preservation Method"], "system": self.SYSTEM_HTAN, + "display": _row["Preservation Method"]}]})})], + "parent": parent_specimen_reference, + "subject": subject}) + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +# atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +atlas_name = ["OHSU"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases + htan_biospecimens = transformer.biospecimens patients = [] research_studies = [] @@ -603,18 +664,30 @@ def create_observation(self, _row: pd.Series, patient: Patient, if condition: conditions.append(condition) - condition_observation = patient_transformer.create_observation(_row=row, patient=patient, official_focus="Condition", focus=[Reference(**{"reference": f"Condition/{condition.id}"})], specimen=None, components=None, category=None) + condition_observation = patient_transformer.create_observation(_row=row, patient=patient, + official_focus="Condition", + focus=[Reference(**{ + "reference": f"Condition/{condition.id}"})], + specimen=None, components=None, + category=None) if condition_observation: observations.append(condition_observation) + specimens = [] + for index, row in htan_biospecimens.iterrows(): + specimen_row = htan_biospecimens.iloc[index] + specimen = specimen_transformer.create_specimen(_row=specimen_row) + if specimen: + specimens.append(specimen) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) transformer.write_ndjson(encounters) transformer.write_ndjson(conditions) transformer.write_ndjson(observations) + 
transformer.write_ndjson(specimens) # participant ids from specimen identifiers - # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) - # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) - + # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) + # print(transformer.decipher_htan_id(cases["HTAN Participant ID"][0])) From 4815b352683d64697eca34b84eed4d851e3208d9 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 08:56:43 -0700 Subject: [PATCH 11/24] specimen observations --- fhirizer/htan2fhir.py | 229 ++++++++++++++++++++++++++---------------- 1 file changed, 144 insertions(+), 85 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 695bde3..c21fb1f 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -244,6 +244,102 @@ def decipher_htan_id(_id) -> dict: deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} return deciphered_id + def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: str, + specimen: Optional[Specimen], official_focus: str, + focus: List[Reference], components: Optional[List], category: Optional[list]) -> Observation: + assert patient_id, f"Observation is missing patient id: {patient_id}." + assert focus, f"Observation for patient {patient_id} is missing focus." + + if not category: + category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "exam", + "display": "exam" + } + ], + "text": "Exam" + } + ] + + observation_fields = [] + + if official_focus not in ["Specimen"]: + mappings = transformer.cases_mappings() + code = { + "coding": [ + { + "system": self.SYSTEM_LOINC, + "code": "68992-7", + "display": "Specimen-related information panel" + } + ], + "text": "Specimen-related information panel" + } + else: + mappings = transformer.biospecimen_mappings() + code = { + "coding": [ + { + "system": "http://loinc.org", + "code": "75323-6", + "display": "Condition" + } + ], + "text": "Condition" + } + + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(mappings, + "Observation.component"): + if _focus == official_focus: + observation_fields.append(_field) + + _obervation_row = _row[observation_fields] if observation_fields else None + + if _obervation_row is not None: + components = [] + for key, value in _obervation_row.to_dict().items(): + if key != 'HTAN Participant ID': + try: + if not pd.isnull(value): + if not isinstance(value, str) and value.is_integer(): + value = int(value) + _component = self.get_component(key=key, value=value, + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) + components.append(_component) + except (ValueError, TypeError): + if self.verbose: + print(f"Components {key}: {value} can't be added to list - value/type error.") + + focus_ids = [r.reference.split("/")[1] for r in focus] + + if patient: + identifier_value = "-".join([patient.identifier[0].value] + focus_ids) + else: + identifier_value = "-".join(focus_ids) + + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": identifier_value}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + specimen_ref = None + if specimen: + specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) + # add valueCodeableConcept as needed after creation + 
return Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "category": category, + "code": code, + "focus": focus, + "subject": Reference(**{"reference": f"Patient/{patient_id}"}), + "component": components, + "specimen": specimen_ref}) + def write_ndjson(self, entities): resource_type = entities[0].resource_type entities = [orjson.loads(entity.json()) for entity in entities] @@ -259,6 +355,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.get_data_types = utils.get_data_types self.get_component = self.get_component self.get_fields_by_fhir_map = self.get_fields_by_fhir_map + self.create_observation = self.create_observation def create_patient(self, _row: pd.Series) -> Patient: """Transform HTAN case demographics to FHIR Patient""" @@ -483,78 +580,6 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) - def create_observation(self, _row: pd.Series, patient: Patient, - specimen: Optional[Specimen], official_focus: str, - focus: List[Reference], components: Optional[List], category: Optional[dict]) -> Observation: - assert focus, f"Observation for patient {patient.id} is missing focus." - - if not category: - category = [ - { - "coding": [ - { - "system": "http://terminology.hl7.org/CodeSystem/observation-category", - "code": "exam", - "display": "exam" - } - ], - "text": "Exam" - } - ] - - observation_fields = [] - for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), - "Observation.component"): - if _focus == official_focus: - observation_fields.append(_field) - - _obervation_row = _row[observation_fields] if observation_fields else None - - if _obervation_row is not None: - components = [] - for key, value in _obervation_row.to_dict().items(): - if key != 'HTAN Participant ID': - try: - if not pd.isnull(value): - if not isinstance(value, str) and value.is_integer(): - value = int(value) - _component = self.get_component(key=key, value=value, - component_type=self.get_data_types(type(value).__name__), - system=self.SYSTEM_HTAN) - components.append(_component) - except (ValueError, TypeError): - if self.verbose: - print(f"Components {key}: {value} can't be added to list - value/type error.") - - focus_ids = [r.reference.split("/")[1] for r in focus] - observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, - "use": "official", - "value": "-".join([patient.identifier[0].value] + focus_ids)}) - observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", - project_id=self.project_id, namespace=self.NAMESPACE_HTAN) - specimen_ref = None - if specimen: - specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) - # add valueCodeableConcept as needed after creation - return Observation(**{"id": observation_id, - "identifier": [observation_identifier], - "status": "final", - "category": category, - "code": { - "coding": [ - { - "system": self.SYSTEM_LOINC, - "code": "75323-6", # TODO: place-holder - "display": "Condition" - } - ], - "text": "Condition" - }, - "focus": focus, - "subject": Reference(**{"reference": f"Patient/{patient.id}"}), - "component": components, - "specimen": specimen_ref}) - def create_medication_administration(self) -> MedicationAdministration: return MedicationAdministration(**{}) @@ -567,6 +592,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.get_data_types = utils.get_data_types self.get_component = self.get_component self.get_fields_by_fhir_map = 
self.get_fields_by_fhir_map + self.create_observation = self.create_observation def create_specimen(self, _row: pd.Series) -> Specimen: """Transform HTAN biospecimen to FHIR Specimen""" @@ -580,18 +606,16 @@ def create_specimen(self, _row: pd.Series) -> Specimen: participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." - patient_identifier = Identifier( - **{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) - patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) - subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists + patient_id = self.get_specimen_patient(_row=_row) + subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists parent_specimen_reference = [] if not pd.isnull(_row["HTAN Parent ID"]): parent_specimen_identifier = Identifier( **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Biospecimen ID'], "use": "official"}) - parent_specimen_id = self.mint_id(identifier=parent_specimen_identifier, resource_type="Specimen", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + parent_specimen_id = self.mint_id(identifier=parent_specimen_identifier, resource_type="Specimen", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) parent_specimen_reference.append(Reference(**{"reference": f"Specimen/{parent_specimen_id}"})) specimen_fields = [] @@ -610,17 +634,27 @@ def create_specimen(self, _row: pd.Series) -> Specimen: "parent": parent_specimen_reference, "subject": subject}) + def get_specimen_patient(self, _row) -> str: + participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] + assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
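# Every transform loop below finishes with write_ndjson, which de-duplicates
# resources by id and delegates to utils.fhir_ndjson. That helper is not shown
# in this patch; a stand-in with the same observable output (one JSON object
# per line under META/<ResourceType>.ndjson) could look like this sketch:
import orjson

def fhir_ndjson_sketch(entities: list, path: str) -> None:
    with open(path, "wb") as fp:
        for entity in entities:  # plain dicts, e.g. orjson.loads(resource.json())
            fp.write(orjson.dumps(entity) + b"\n")

# fhir_ndjson_sketch(specimen_dicts, "./projects/HTAN/OHSU/META/Specimen.ndjson")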
+ + patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + return patient_id + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -# atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] -atlas_name = ["OHSU"] +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", + "Vanderbilt"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) - specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases @@ -633,7 +667,6 @@ def create_specimen(self, _row: pd.Series) -> Specimen: encounters = [] observations = [] for index, row in cases.iterrows(): - research_study = patient_transformer.create_researchstudy(_row=row) if research_study: @@ -665,6 +698,7 @@ def create_specimen(self, _row: pd.Series) -> Specimen: conditions.append(condition) condition_observation = patient_transformer.create_observation(_row=row, patient=patient, + patient_id=patient.id, official_focus="Condition", focus=[Reference(**{ "reference": f"Condition/{condition.id}"})], @@ -674,12 +708,37 @@ def create_specimen(self, _row: pd.Series) -> Specimen: observations.append(condition_observation) specimens = [] - for index, row in htan_biospecimens.iterrows(): - specimen_row = htan_biospecimens.iloc[index] + for specimen_index, specimen_row in htan_biospecimens.iterrows(): + # specimen_row = htan_biospecimens.iloc[specimen_index] specimen = specimen_transformer.create_specimen(_row=specimen_row) if specimen: specimens.append(specimen) + specimen_observation_category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "laboratory", + "display": "laboratory" + } + ], + "text": "Laboratory" + } + ] + + specimen_participant_id = specimen_transformer.get_specimen_patient(_row=specimen_row) + specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, + official_focus="Specimen", + focus=[Reference(**{ + "reference": f"Specimen/{specimen.id}"})], + patient_id=specimen_participant_id, + specimen=specimen, components=None, + category=specimen_observation_category) + # print(specimen_observation.component) + if specimen_observation: + observations.append(specimen_observation) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) From a70f3489f702754fb4d614142d21e818241ed955 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 11:57:34 -0700 Subject: [PATCH 12/24] document reference - in progress --- fhirizer/htan2fhir.py | 145 +++++++++++++++++++++++++--- resources/htan_resources/files.json | 8 +- 2 files changed, 135 insertions(+), 18 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index c21fb1f..8ab7860 100644 --- 
a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -3,6 +3,7 @@ import numpy as np import orjson +import mimetypes import copy import glob import pathlib @@ -31,7 +32,7 @@ from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection from fhir.resources.condition import Condition, ConditionStage from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ - DocumentReferenceContentProfile + DocumentReferenceContentProfile, DocumentReferenceRelatesTo from fhir.resources.attachment import Attachment from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication @@ -127,6 +128,15 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.patient_demographics = self.get_patient_demographics() + # combine and create standard fhir files metadata + # print(self.files["Filename"].str.split('/')[1]) + self.files = self.files[self.files["Filename"].str.contains('.')] # NOTE: HTAPP contains file names ex. HTA1_982_7629309080080, that do not have any metadata + self.files = self.files[self.files["Filename"].str.contains('/')] + + self.files['mime_type'] = self.files["Filename"].apply(lambda x: mimetypes.guess_type(x)[0]) + self.files['name'] = self.files["Filename"].str.split('/').apply(lambda x: x[1]) + self.files_drs_meta = self.files.merge(self.files_drs_uri, how="left", on="name") + def get_cases_mappings(self) -> dict: """HTAN cases FHIR mapping""" return self.read_json(self.cases_path) @@ -340,6 +350,12 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien "component": components, "specimen": specimen_ref}) + def get_patient_id(self, participant_id) -> str: + patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + return patient_id + def write_ndjson(self, entities): resource_type = entities[0].resource_type entities = [orjson.loads(entity.json()) for entity in entities] @@ -593,6 +609,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.get_component = self.get_component self.get_fields_by_fhir_map = self.get_fields_by_fhir_map self.create_observation = self.create_observation + self.get_patient_id = self.get_patient_id def create_specimen(self, _row: pd.Series) -> Specimen: """Transform HTAN biospecimen to FHIR Specimen""" @@ -606,7 +623,7 @@ def create_specimen(self, _row: pd.Series) -> Specimen: participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." - patient_id = self.get_specimen_patient(_row=_row) + patient_id = self.get_patient_id(participant_id=participant_id) subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists parent_specimen_reference = [] @@ -634,31 +651,121 @@ def create_specimen(self, _row: pd.Series) -> Specimen: "parent": parent_specimen_reference, "subject": subject}) - def get_specimen_patient(self, _row) -> str: - participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] - assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
- patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) - patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) - return patient_id +class DocumentReferenceTransformer(HTANTransformer): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(**kwargs) + self.cases_mapping = self.cases_mappings + self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + self.get_data_types = utils.get_data_types + self.get_component = self.get_component + self.get_fields_by_fhir_map = self.get_fields_by_fhir_map + self.create_observation = self.create_observation + self.get_patient_id = self.get_patient_id + + def create_document_reference(self, _row: pd.Series) -> DocumentReference: + """Transform HTAN files to FHIR DocumentReference""" + + document_reference_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Data File ID'], "use": "official"}) + + document_reference_synapse_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['Synapse Id'], "use": "secondary"}) + + document_reference_id = self.mint_id(identifier=document_reference_identifier, + resource_type="DocumentReference", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + # participant id + patient_id = None + if "HTAN Participant ID" in _row.keys() and not pd.isnull(_row["HTAN Participant ID"]): + participant_id = _row["HTAN Participant ID"] + assert participant_id, f"DocumentRefernce {_row["HTAN Data File ID"]} does not have a patient participant associated with it." + patient_id = self.get_patient_id(participant_id=participant_id) + + name = None + if _row["Filename"]: + name = _row["Filename"] + + profiles = [] + if not pd.isnull(_row['drs_uri']): + uri_profile = DocumentReferenceContentProfile(**{"valueUri": _row['drs_uri']}) + profiles.append(uri_profile) + + category = [] + if not pd.isnull(_row['Assay']): + category.append(CodeableConcept(**{"coding": [{"code": _row['Assay'], "display": _row['Assay'], "system": "/".join([self.SYSTEM_HTAN, "Assay"])}]})) + if not pd.isnull(_row['Level']): + category.append(CodeableConcept(**{"coding": [{"code": _row['Level'], "display": _row['Level'], "system": "/".join([self.SYSTEM_HTAN, "Level"])}]})) + + subject = None + if patient_id: + Reference(**{"reference": f"Patient/{patient_id}"}) + + based_on = [] + if not pd.isnull(_row['Biospecimen']): + specimen_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['Biospecimen'], "use": "official"}) + specimen_id = self.mint_id(identifier=specimen_identifier, resource_type="Specimen", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + based_on.append(Reference(**{"reference": f"Specimen/{specimen_id}"})) + + security_label = [] + if not pd.isnull(_row['Data Access']): + security_label.append(CodeableConcept(**{"coding": [{"code":_row['Data Access'], "display": _row['Data Access'], "system": "/".join([self.SYSTEM_HTAN, "Data_Access"])}]})) + + parent_data_file = [] + if not pd.isnull(_row["Parent Data File ID"]): + parent_document_reference_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row["Parent Data File ID"], "use": "official"}) + + parent_document_reference_id = self.mint_id(identifier=parent_document_reference_identifier, + resource_type="DocumentReference", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + parent_data_file.append(DocumentReferenceRelatesTo(**{ + "code": 
CodeableConcept(**{"coding": [{"code": "parent_data_file", + "system": "/".join([self.SYSTEM_HTAN, "Parent_Data_File_ID"]), + "display": "parent_data_file"}]}), + "target": Reference(**{"reference": f"Documentreference/{parent_document_reference_id}"})})) + + return DocumentReference(**{"id": document_reference_id, + "identifier": [document_reference_identifier, document_reference_synapse_identifier], + "status": "current", + "docStatus": "final", + # "basedOn": based_on, # TODO: requires check for specimen - missing data + "subject": subject, + # "relatesTo": parent_data_file, # TODO: requires check for file - missing data + "category": category, + "securityLabel": security_label, + "content": [DocumentReferenceContent( + **{"attachment": Attachment(**{"title": name, "contentType": _row["mime_type"]}), + "profile": profiles + })] + }) # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", - "Vanderbilt"] -for name in atlas_name: +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +for name in atlas_name: + # print(name) transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + documentreference_transformer = DocumentReferenceTransformer(subprogram_name=name, + out_dir=f"./projects/HTAN/{name}/META", + verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases htan_biospecimens = transformer.biospecimens + files = transformer.files + files_drs_meta = transformer.files_drs_meta patients = [] research_studies = [] @@ -727,7 +834,11 @@ def get_specimen_patient(self, _row) -> str: } ] - specimen_participant_id = specimen_transformer.get_specimen_patient(_row=specimen_row) + participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ + "participant_id"] + assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
+ + specimen_participant_id = specimen_transformer.get_patient_id(participant_id=participant_id) specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, official_focus="Specimen", focus=[Reference(**{ @@ -735,10 +846,15 @@ def get_specimen_patient(self, _row) -> str: patient_id=specimen_participant_id, specimen=specimen, components=None, category=specimen_observation_category) - # print(specimen_observation.component) if specimen_observation: observations.append(specimen_observation) + document_references = [] + for document_reference_index, document_reference_row in files_drs_meta.iterrows(): + docref = documentreference_transformer.create_document_reference(_row=document_reference_row) + if docref: + document_references.append(docref) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) @@ -746,6 +862,7 @@ def get_specimen_patient(self, _row) -> str: transformer.write_ndjson(conditions) transformer.write_ndjson(observations) transformer.write_ndjson(specimens) + transformer.write_ndjson(document_references) # participant ids from specimen identifiers # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) diff --git a/resources/htan_resources/files.json b/resources/htan_resources/files.json index 8e31dec..6baa511 100644 --- a/resources/htan_resources/files.json +++ b/resources/htan_resources/files.json @@ -29,8 +29,8 @@ ], "Level": [ { - "fhir_map": "Observation.component", - "focus": "DocumentReference" + "fhir_map": "DocumentReference.category", + "focus": null } ], "Organ": [ @@ -72,7 +72,7 @@ ], "HTAN Parent Biospecimen ID": [ { - "fhir_map": "Specimen.parent", + "fhir_map": "Specimen.identifier", "focus": null } ], @@ -352,7 +352,7 @@ ], "Parent Biospecimen ID": [ { - "fhir_map": "Specimen.parent", + "fhir_map": "Specimen.identifier", "focus": null } ], From 47b70e56b67d247ae27bd02b810b5d9fef225e39 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 12:47:17 -0700 Subject: [PATCH 13/24] specimen refernece --- fhirizer/htan2fhir.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 8ab7860..f1069ac 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -663,7 +663,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.create_observation = self.create_observation self.get_patient_id = self.get_patient_id - def create_document_reference(self, _row: pd.Series) -> DocumentReference: + def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> DocumentReference: """Transform HTAN files to FHIR DocumentReference""" document_reference_identifier = Identifier( @@ -709,7 +709,8 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: specimen_id = self.mint_id(identifier=specimen_identifier, resource_type="Specimen", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) - based_on.append(Reference(**{"reference": f"Specimen/{specimen_id}"})) + if specimen_id in specimen_ids: + based_on.append(Reference(**{"reference": f"Specimen/{specimen_id}"})) security_label = [] if not pd.isnull(_row['Data Access']): @@ -734,7 +735,7 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: "identifier": [document_reference_identifier, document_reference_synapse_identifier], "status": "current", "docStatus": "final", - # "basedOn": based_on, # TODO: requires check for specimen - missing data + "basedOn": 
based_on, "subject": subject, # "relatesTo": parent_data_file, # TODO: requires check for file - missing data "category": category, @@ -749,9 +750,10 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] - +# atlas_name = ["OHSU"] for name in atlas_name: - # print(name) + print(f"Processing HTAN atlas {name}") + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) @@ -849,9 +851,10 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: if specimen_observation: observations.append(specimen_observation) + specimen_ids = [s.id for s in specimens] document_references = [] for document_reference_index, document_reference_row in files_drs_meta.iterrows(): - docref = documentreference_transformer.create_document_reference(_row=document_reference_row) + docref = documentreference_transformer.create_document_reference(_row=document_reference_row, specimen_ids=specimen_ids) if docref: document_references.append(docref) From de69ad166cbcb7a89dcba8ac0fffc39693d6a88b Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 13:39:04 -0700 Subject: [PATCH 14/24] observations w focus document reference --- fhirizer/htan2fhir.py | 129 +++++++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 39 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index f1069ac..1aed72d 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -60,6 +60,18 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson + self.lab_category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "laboratory", + "display": "laboratory" + } + ], + "text": "Laboratory" + } + ] parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, @@ -130,7 +142,8 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): # combine and create standard fhir files metadata # print(self.files["Filename"].str.split('/')[1]) - self.files = self.files[self.files["Filename"].str.contains('.')] # NOTE: HTAPP contains file names ex. HTA1_982_7629309080080, that do not have any metadata + self.files = self.files[self.files["Filename"].str.contains( + '.')] # NOTE: HTAPP contains file names ex. 
HTA1_982_7629309080080, that do not have any metadata self.files = self.files[self.files["Filename"].str.contains('/')] self.files['mime_type'] = self.files["Filename"].apply(lambda x: mimetypes.guess_type(x)[0]) @@ -254,10 +267,10 @@ def decipher_htan_id(_id) -> dict: deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} return deciphered_id - def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: str, + def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: Optional[str], specimen: Optional[Specimen], official_focus: str, - focus: List[Reference], components: Optional[List], category: Optional[list]) -> Observation: - assert patient_id, f"Observation is missing patient id: {patient_id}." + focus: List[Reference], components: Optional[List], category: Optional[list], relax : bool) -> Observation: + # assert patient_id, f"Observation is missing patient id: {patient_id}." # HTAN files doesn't always point to patient assert focus, f"Observation for patient {patient_id} is missing focus." if not category: @@ -276,29 +289,41 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien observation_fields = [] - if official_focus not in ["Specimen"]: + if official_focus in ["Patient", "Condition"]: mappings = transformer.cases_mappings() + code = { + "coding": [ + { + "system": "http://loinc.org", + "code": "75323-6", + "display": "Condition" + } + ], + "text": "Condition" + } + elif official_focus in ["DocumentReference"]: + mappings = transformer.files_mappings() code = { "coding": [ { "system": self.SYSTEM_LOINC, "code": "68992-7", - "display": "Specimen-related information panel" + "display": "Specimen-related information panel" #TODO: find general code } ], "text": "Specimen-related information panel" } - else: + elif official_focus in ["Specimen"]: mappings = transformer.biospecimen_mappings() code = { "coding": [ { - "system": "http://loinc.org", - "code": "75323-6", - "display": "Condition" + "system": self.SYSTEM_LOINC, + "code": "68992-7", + "display": "Specimen-related information panel" } ], - "text": "Condition" + "text": "Specimen-related information panel" } for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(mappings, @@ -306,7 +331,10 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien if _focus == official_focus: observation_fields.append(_field) - _obervation_row = _row[observation_fields] if observation_fields else None + if not relax: + _obervation_row = _row[observation_fields] if observation_fields else None + else: + _obervation_row = _row # user-specific columns in files - add all to component if _obervation_row is not None: components = [] @@ -339,6 +367,10 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien specimen_ref = None if specimen: specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) + + subject = None + if patient_id: + subject = Reference(**{"reference": f"Patient/{patient_id}"}) # add valueCodeableConcept as needed after creation return Observation(**{"id": observation_id, "identifier": [observation_identifier], @@ -346,7 +378,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien "category": category, "code": code, "focus": focus, - "subject": Reference(**{"reference": f"Patient/{patient_id}"}), + "subject": subject, "component": components, "specimen": specimen_ref}) @@ -694,9 +726,11 @@ def create_document_reference(self, 
_row: pd.Series, specimen_ids: list) -> Docu category = [] if not pd.isnull(_row['Assay']): - category.append(CodeableConcept(**{"coding": [{"code": _row['Assay'], "display": _row['Assay'], "system": "/".join([self.SYSTEM_HTAN, "Assay"])}]})) + category.append(CodeableConcept(**{"coding": [ + {"code": _row['Assay'], "display": _row['Assay'], "system": "/".join([self.SYSTEM_HTAN, "Assay"])}]})) if not pd.isnull(_row['Level']): - category.append(CodeableConcept(**{"coding": [{"code": _row['Level'], "display": _row['Level'], "system": "/".join([self.SYSTEM_HTAN, "Level"])}]})) + category.append(CodeableConcept(**{"coding": [ + {"code": _row['Level'], "display": _row['Level'], "system": "/".join([self.SYSTEM_HTAN, "Level"])}]})) subject = None if patient_id: @@ -714,7 +748,9 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu security_label = [] if not pd.isnull(_row['Data Access']): - security_label.append(CodeableConcept(**{"coding": [{"code":_row['Data Access'], "display": _row['Data Access'], "system": "/".join([self.SYSTEM_HTAN, "Data_Access"])}]})) + security_label.append(CodeableConcept(**{"coding": [ + {"code": _row['Data Access'], "display": _row['Data Access'], + "system": "/".join([self.SYSTEM_HTAN, "Data_Access"])}]})) parent_data_file = [] if not pd.isnull(_row["Parent Data File ID"]): @@ -722,8 +758,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu **{"system": self.SYSTEM_HTAN, "value": _row["Parent Data File ID"], "use": "official"}) parent_document_reference_id = self.mint_id(identifier=parent_document_reference_identifier, - resource_type="DocumentReference", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + resource_type="DocumentReference", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) parent_data_file.append(DocumentReferenceRelatesTo(**{ "code": CodeableConcept(**{"coding": [{"code": "parent_data_file", @@ -732,7 +768,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "target": Reference(**{"reference": f"Documentreference/{parent_document_reference_id}"})})) return DocumentReference(**{"id": document_reference_id, - "identifier": [document_reference_identifier, document_reference_synapse_identifier], + "identifier": [document_reference_identifier, + document_reference_synapse_identifier], "status": "current", "docStatus": "final", "basedOn": based_on, @@ -741,7 +778,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "category": category, "securityLabel": security_label, "content": [DocumentReferenceContent( - **{"attachment": Attachment(**{"title": name, "contentType": _row["mime_type"]}), + **{"attachment": Attachment( + **{"title": name, "contentType": _row["mime_type"]}), "profile": profiles })] }) @@ -749,11 +787,12 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", + "Vanderbilt"] # atlas_name = ["OHSU"] for name in atlas_name: - print(f"Processing HTAN atlas {name}") - + print(f"Transforming {name}") + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = 
PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) @@ -812,7 +851,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu focus=[Reference(**{ "reference": f"Condition/{condition.id}"})], specimen=None, components=None, - category=None) + category=None, + relax=False) if condition_observation: observations.append(condition_observation) @@ -823,19 +863,6 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu if specimen: specimens.append(specimen) - specimen_observation_category = [ - { - "coding": [ - { - "system": "http://terminology.hl7.org/CodeSystem/observation-category", - "code": "laboratory", - "display": "laboratory" - } - ], - "text": "Laboratory" - } - ] - participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ "participant_id"] assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." @@ -847,17 +874,41 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "reference": f"Specimen/{specimen.id}"})], patient_id=specimen_participant_id, specimen=specimen, components=None, - category=specimen_observation_category) + category=transformer.lab_category, + relax=False) if specimen_observation: observations.append(specimen_observation) specimen_ids = [s.id for s in specimens] + patient_ids = [p.id for p in patients] document_references = [] for document_reference_index, document_reference_row in files_drs_meta.iterrows(): - docref = documentreference_transformer.create_document_reference(_row=document_reference_row, specimen_ids=specimen_ids) + docref = documentreference_transformer.create_document_reference(_row=document_reference_row, + specimen_ids=specimen_ids) if docref: document_references.append(docref) + docref_patient_id = None + if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull(document_reference_row['HTAN Participant ID']): + docref_patient = documentreference_transformer.get_patient_id(participant_id=document_reference_row['HTAN Participant ID']) + if docref_patient in patient_ids: + docref_patient_id = docref_patient + # else: + # print(f"HTAN {name} is missing patient reference in files") + + document_reference_observation = documentreference_transformer.create_observation( + _row=document_reference_row, patient=None, + official_focus="DocumentReference", + focus=[Reference(**{ + "reference": f"DocumentReference/{docref.id}"})], + patient_id=docref_patient_id, + specimen=None, components=None, + category=transformer.lab_category, + relax=True) + + if document_reference_observation: + observations.append(document_reference_observation) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) From 44b55f94aee030e5d8905379bdbb70e603235b97 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 8 Oct 2024 07:25:22 -0700 Subject: [PATCH 15/24] ChEMBL query --- fhirizer/utils.py | 88 +++++++++++++++++++++++++++++++++++++++-------- setup.py | 1 + 2 files changed, 74 insertions(+), 15 deletions(-) diff --git a/fhirizer/utils.py b/fhirizer/utils.py index 92f1744..16513ac 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -2,6 +2,7 @@ import orjson import time import random +import sqlite3 import json import glob import gzip @@ -17,8 +18,10 @@ from fhir.resources import get_fhir_model_class from uuid import uuid5, UUID -DATA_DICT_PATH = 
"".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'data_dictionary')), "/"]) -FIELDS_PATH = "".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'fields')), "/"]) +DATA_DICT_PATH = "".join( + [str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'data_dictionary')), "/"]) +FIELDS_PATH = "".join( + [str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'fields')), "/"]) package_dir = Path(importlib.resources.files('fhirizer').parent) @@ -205,6 +208,7 @@ def _read_json(path): except json.JSONDecodeError as e: print("Error decoding JSON: {}".format(e)) + # -------------------------------------------------------------------------- # GDC Utility functions # -------------------------------------------------------------------------- @@ -740,7 +744,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): if shared_keys: shared_keys_items = next(iter(shared_keys)) if verbose: - print(f"======== instance Dict {target_key} ============== case C", "shared_keys: ", shared_keys) + print(f"======== instance Dict {target_key} ============== case C", "shared_keys: ", + shared_keys) if isinstance(data[key][0][shared_keys_items], str) and isinstance( data_to_append[shared_keys_items], str) and data[key][0][shared_keys_items] != \ @@ -752,7 +757,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): print("Specimen.id" in item.keys()) print(len(item.keys())) - if len(item.keys()) == 1 and "Specimen.id" in list(item.keys())[0] and data_to_append.keys() != item.keys(): + if len(item.keys()) == 1 and "Specimen.id" in list(item.keys())[ + 0] and data_to_append.keys() != item.keys(): # this is where metadata is updated if the head key with Specimen.id exists item.update(data_to_append) reached = True @@ -762,7 +768,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): # this is where first Specimen.id is appended data[key].append(data_to_append) if verbose: - print(f"======== instance Dict {target_key} ============== case D AFTER", "data[key]: ", data[key], "\n\n") + print(f"======== instance Dict {target_key} ============== case D AFTER", + "data[key]: ", data[key], "\n\n") continue elif isinstance(data[key][0][shared_keys_items], str) and isinstance( @@ -799,7 +806,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): d.update(data_to_append) continue if verbose: - print(f"======== instance Dict {target_key} ============== case F After", "data[key]: ", data[key], "\n\n") + print(f"======== instance Dict {target_key} ============== case F After", + "data[key]: ", data[key], "\n\n") continue elif (isinstance(data[key][0][shared_keys_items], list) and isinstance( @@ -807,7 +815,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): not data[key][0][shared_keys_items][0].items() <= data_to_append[shared_keys_items][ 0].items()): if verbose: - print(f"======== instance Dict {target_key} ============== case G", "data[key]: ", data[key]) + print(f"======== instance Dict {target_key} ============== case G", "data[key]: ", + data[key]) if data[key][0]: if len(data[key]) > 1 and len(data[key][-1]) == 1: @@ -820,7 +829,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): and not data_to_append.items() <= item.items()): item.update(data_to_append) if verbose: - print(f"======== instance Dict {target_key} ============== case H AFTER", "item: ", item, "\n\n") + 
print(f"======== instance Dict {target_key} ============== case H AFTER", + "item: ", item, "\n\n") continue elif (data[key] and key == "samples" @@ -967,8 +977,8 @@ def make_request(api_url, retries=3): return response.json() else: print(f"Received status code: {response.status_code}. Retrying...") - delay *= 2 ** retries # change delay - time.sleep(delay + random.uniform(0, 1)) # add jitter + delay *= 2 ** retries # change delay + time.sleep(delay + random.uniform(0, 1)) # add jitter raise Exception("Failed to fetch data after multiple retries") @@ -982,7 +992,8 @@ def fetch_cellines(cellosaurus_ids, out_dir): if not os.path.exists(out_dir): os.makedirs(out_dir) - existing_ids = set(os.path.splitext(os.path.basename(file))[0] for file in os.listdir(out_dir) if file.endswith('.json')) + existing_ids = set( + os.path.splitext(os.path.basename(file))[0] for file in os.listdir(out_dir) if file.endswith('.json')) to_fetch_ids = set(cellosaurus_ids) - existing_ids for cellosaurus_id in to_fetch_ids: @@ -1038,8 +1049,8 @@ def cellosaurus_cancer_ids(path, out_path, save=False): # has sex annotation for celline in cl_cancer_depmap: for subset in celline["subset"]: - if subset in ["Female", "Male"]: - ids.append(celline["id"][0]) + if subset in ["Female", "Male"]: + ids.append(celline["id"][0]) # 67763 cell-lines # 62019 cell-lines w gender @@ -1090,7 +1101,8 @@ def get_data_types(data_type): return data_type -def get_component(key, value=None, component_type=None, system="https://cadsr.cancer.gov/sample_laboratory_observation"): +def get_component(key, value=None, component_type=None, + system="https://cadsr.cancer.gov/sample_laboratory_observation"): if component_type == 'string': value = {"valueString": value} elif component_type == 'int': @@ -1188,4 +1200,50 @@ def create_or_extend(new_items, folder_path='META', resource_type='Observation', else: print(f"{file_name} has been extended, without updating existing data.") else: - print(f"{file_name} has been created.") \ No newline at end of file + print(f"{file_name} has been created.") + + +def get_chembl_compound_info(db_file_path: str, drug_names: list, limit: int) -> list: + """Query Chembl COMPOUND_RECORDS by COMPOUND_NAME to make FHIR Substance""" + drug_names_tuple = tuple([x.upper() for x in drug_names]) + + query = f""" + SELECT + a.MOLREGNO, + a.PREF_NAME, + a.CHEMBL_ID, + a.MAX_PHASE, + a.STRUCTURE_TYPE, + c.STANDARD_INCHI, + c.STANDARD_INCHI_KEY, + c.CANONICAL_SMILES, + d.DOC_ID, + d.PUBMED_ID, + d.DOI, + cr.SRC_ID, + cr.SRC_COMPOUND_ID, + sr.SRC_SHORT_NAME, + sr.SRC_DESCRIPTION + FROM + MOLECULE_DICTIONARY as a + LEFT JOIN + COMPOUND_STRUCTURES as c ON a.MOLREGNO = c.MOLREGNO + LEFT JOIN + ACTIVITIES as p ON a.MOLREGNO = p.MOLREGNO + LEFT JOIN + DOCS as d ON p.DOC_ID = d.DOC_ID + LEFT JOIN + compound_records as cr ON a.MOLREGNO = cr.MOLREGNO + LEFT JOIN + source as sr ON cr.SRC_ID = sr.SRC_ID + WHERE cr.COMPOUND_NAME IN {drug_names_tuple} + LIMIT {limit}; + """ + conn = sqlite3.connect(db_file_path) + cursor = conn.cursor() + cursor.execute(query) + rows = cursor.fetchall() + + conn.close() + + return rows diff --git a/setup.py b/setup.py index 9190e4c..92aad84 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ 'inflection', 'iteration_utilities', 'icd10-cm', + 'sqlite3', 'beautifulsoup4', 'gen3-tracker>=0.0.4rc36', 'fhir.resources>=7.1.0' # FHIR® (Release R5, version 5.0.0) From 67677dcb9f9c5ff7f8d2d3339f6d27b6aaa24868 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 8 Oct 2024 07:27:18 -0700 Subject: [PATCH 16/24] remove lib 
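sqlite3 ships with the Python standard library, so it cannot be installed from PyPI and should not be listed in install_requires; this removes the entry added in the previous patch.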
--- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 92aad84..9190e4c 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ 'inflection', 'iteration_utilities', 'icd10-cm', - 'sqlite3', 'beautifulsoup4', 'gen3-tracker>=0.0.4rc36', 'fhir.resources>=7.1.0' # FHIR® (Release R5, version 5.0.0) From eb869764720ef920a5473f6026935153c675a59a Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 8 Oct 2024 10:53:56 -0700 Subject: [PATCH 17/24] initial htan medadmin and med --- fhirizer/htan2fhir.py | 160 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 140 insertions(+), 20 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 1aed72d..ae75619 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -26,6 +26,7 @@ from fhir.resources.observation import Observation from fhir.resources.encounter import Encounter from fhir.resources.codeableconcept import CodeableConcept +from fhir.resources.codeablereference import CodeableReference from fhir.resources.age import Age from fhir.resources.procedure import Procedure from fhir.resources.bodystructure import BodyStructure, BodyStructureIncludedStructure @@ -34,8 +35,11 @@ from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ DocumentReferenceContentProfile, DocumentReferenceRelatesTo from fhir.resources.attachment import Attachment +from fhir.resources.timing import Timing from fhir.resources.medicationadministration import MedicationAdministration -from fhir.resources.medication import Medication +from fhir.resources.medication import Medication, MedicationIngredient +from fhir.resources.substance import Substance, SubstanceIngredient +from fhir.resources.substancedefinition import SubstanceDefinition, SubstanceDefinitionStructure # File data on synapse after authentication @@ -49,6 +53,7 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.get_data_type = utils.get_data_types self.get_component = utils.get_component self.fhir_ndjson = utils.fhir_ndjson + self.get_chembl_compound_info = utils.get_chembl_compound_info self.subprogram_name = subprogram_name self.project_id = subprogram_name # incase there will be more granular project/program relations assert Path(out_dir).is_dir(), f"Path to out_dir {out_dir} is not a directory." @@ -72,7 +77,16 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): "text": "Laboratory" } ] - + self.med_admin_code = { + "coding": [ + { + "system": "http://loinc.org", + "code": "80565-5", + "display": "Medication administration record" + } + ], + "text": "Medication administration record" + } parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, resource_type="ResearchStudy", @@ -269,7 +283,8 @@ def decipher_htan_id(_id) -> dict: def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: Optional[str], specimen: Optional[Specimen], official_focus: str, - focus: List[Reference], components: Optional[List], category: Optional[list], relax : bool) -> Observation: + focus: List[Reference], components: Optional[List], category: Optional[list], + relax: bool) -> Observation: # assert patient_id, f"Observation is missing patient id: {patient_id}." # HTAN files doesn't always point to patient assert focus, f"Observation for patient {patient_id} is missing focus." 
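# A sketch, not part of the patch: the mint_id(...) calls used throughout these
# hunks mint deterministic FHIR resource ids from HTAN identifiers. The real
# implementation lives in fhirizer.utils; a hypothetical equivalent built on the
# uuid3(NAMESPACE_DNS, SYSTEM_HTAN) namespace defined at the top of htan2fhir.py:
from uuid import uuid3, uuid5, NAMESPACE_DNS

SYSTEM_HTAN = "https://data.humantumoratlas.org"
NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SYSTEM_HTAN)

def mint_id_sketch(identifier_value: str, resource_type: str, project_id: str) -> str:
    # Compose a stable name and hash it into the HTAN namespace; the exact
    # composition used by utils.mint_id may differ - illustrative only.
    return str(uuid5(NAMESPACE_HTAN, "|".join([project_id, resource_type, identifier_value])))

# The same HTAN participant id always yields the same Patient id:
assert mint_id_sketch("HTA9_1", "Patient", "OHSU") == mint_id_sketch("HTA9_1", "Patient", "OHSU")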
@@ -301,6 +316,11 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien ], "text": "Condition" } + + elif official_focus in ["MedicationAdministration"]: + mappings = transformer.cases_mappings() + code = self.med_admin_code + elif official_focus in ["DocumentReference"]: mappings = transformer.files_mappings() code = { @@ -308,11 +328,12 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien { "system": self.SYSTEM_LOINC, "code": "68992-7", - "display": "Specimen-related information panel" #TODO: find general code + "display": "Specimen-related information panel" # TODO: find general code } ], "text": "Specimen-related information panel" } + elif official_focus in ["Specimen"]: mappings = transformer.biospecimen_mappings() code = { @@ -334,7 +355,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien if not relax: _obervation_row = _row[observation_fields] if observation_fields else None else: - _obervation_row = _row # user-specific columns in files - add all to component + _obervation_row = _row # user-specific columns in files - add all to component if _obervation_row is not None: components = [] @@ -628,8 +649,73 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) - def create_medication_administration(self) -> MedicationAdministration: - return MedicationAdministration(**{}) + def create_medication_administration(self, _row: pd.Series, patient_id: str) -> dict: + # if Treatment Type exists - make MedicationAdministration + # if Days to Treatment End, then status -> completed, else status unknown + # if Therapeutic Agents is null, then Medication.code -> snomed_code: Unknown 261665006 + # Medication.ingredient.item -> Substance.code -> SubstanceDefination + + status = None + substance_definition = None + substance = None + medication = None + medication_code = None + + if not pd.isnull(_row["Days to Treatment End"]): + status = "completed" + else: + status = "unknown" + + if pd.isnull(_row["Therapeutic Agents"]): + medication_code = CodeableConcept(**{"coding": [{ + "code": "261665006", + "system": self.SYSTEM_SNOME, + "display": "Unknown" + }]}) + else: + # drug_info_df = pd.DataFrame(self.get_chembl_compound_info(db_file_path="./reources/chemble/chembl_34.db", drug_names=list(_row["Theraputic Agent"]))) + medication_code = CodeableConcept(**{"coding": [{"code": _row["Therapeutic Agents"], + "system": self.SYSTEM_HTAN, + "display": _row["Therapeutic Agents"]}]}) + + timing = 0 + if not pd.isnull(_row["Days to Treatment End"]) and not pd.isnull(_row["Days to Treatment Start"]): + timing = int(_row["Days to Treatment End"]) - int(_row["Days to Treatment Start"]) + + # TODO: replace with chembl + # substance_definition = SubstanceDefinition(**{}) + # substance = Substance(**{}) + + medication_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", + "value": medication_code.coding[0].display}) + medication_id = self.mint_id(identifier=medication_identifier, + resource_type="Medication", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + medication = Medication(**{"id": medication_id, "identifier": [medication_identifier], "code": medication_code}) + + medication_admin_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", + "value": "-".join([_row["Atlas Name"], _row["HTAN Participant ID"], _row["Treatment Type"]])}) + medication_admin_id = self.mint_id(identifier=medication_admin_identifier, + 
resource_type="MedicationAdministration", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + data = {"id": medication_admin_id, + "identifier": [medication_admin_identifier], + "status": status, + "occurenceDateTime": "2024-10-8T10:30:00.724446-05:00", + "category": [CodeableConcept(**{"coding": [{"code": _row["Treatment Type"], + "system": "/".join([self.SYSTEM_HTAN,"Treatment_Type"]) , + "display": _row["Treatment Type"]}]})], + "medication": CodeableReference(**{"concept": medication_code, "reference": Reference( + **{"reference": f"Medication/{medication.id}"})}), + "subject": Reference(**{"reference": f"Patient/{patient_id}"})} + medication_admin = MedicationAdministration(**data) + + return {"medication_admin": medication_admin, + "medication": medication, "substance": substance, + "substance_definition": substance_definition} class SpecimenTransformer(HTANTransformer): @@ -780,15 +866,15 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "content": [DocumentReferenceContent( **{"attachment": Attachment( **{"title": name, "contentType": _row["mime_type"]}), - "profile": profiles - })] + "profile": profiles + })] }) # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", - "Vanderbilt"] + "Vanderbilt"] # atlas_name = ["OHSU"] for name in atlas_name: print(f"Transforming {name}") @@ -814,6 +900,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu conditions = [] encounters = [] observations = [] + med_admins = [] + med = [] for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) @@ -856,6 +944,24 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu if condition_observation: observations.append(condition_observation) + if not pd.isnull(row["Treatment Type"]): + med_admin_dict = patient_transformer.create_medication_administration(_row=row, + patient_id=patient.id) + if med_admin_dict["medication_admin"]: + med_admins.append(med_admin_dict["medication_admin"]) + med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, + official_focus="MedicationAdministration", + focus=[Reference(**{ + "reference": f"MedicationAdministration/{med_admin_dict["medication_admin"].id}"})], + patient_id=patient.id, + specimen=None, components=None, + category=None, + relax=False) + if med_admin_observation: + observations.append(med_admin_observation) + if med_admin_dict["medication"]: + med.append(med_admin_dict["medication"]) + specimens = [] for specimen_index, specimen_row in htan_biospecimens.iterrows(): # specimen_row = htan_biospecimens.iloc[specimen_index] @@ -889,8 +995,10 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu document_references.append(docref) docref_patient_id = None - if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull(document_reference_row['HTAN Participant ID']): - docref_patient = documentreference_transformer.get_patient_id(participant_id=document_reference_row['HTAN Participant ID']) + if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull( + document_reference_row['HTAN Participant ID']): + docref_patient = documentreference_transformer.get_patient_id( + participant_id=document_reference_row['HTAN Participant ID']) if docref_patient in patient_ids: 
docref_patient_id = docref_patient # else: @@ -909,14 +1017,26 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu if document_reference_observation: observations.append(document_reference_observation) - transformer.write_ndjson(research_subjects) - transformer.write_ndjson(research_studies) - transformer.write_ndjson(patients) - transformer.write_ndjson(encounters) - transformer.write_ndjson(conditions) - transformer.write_ndjson(observations) - transformer.write_ndjson(specimens) - transformer.write_ndjson(document_references) + if research_subjects: + transformer.write_ndjson(research_subjects) + if research_studies: + transformer.write_ndjson(research_studies) + if patients: + transformer.write_ndjson(patients) + if encounters: + transformer.write_ndjson(encounters) + if conditions: + transformer.write_ndjson(conditions) + if observations: + transformer.write_ndjson(observations) + if specimens: + transformer.write_ndjson(specimens) + if document_references: + transformer.write_ndjson(document_references) + if med_admins: + transformer.write_ndjson(med_admins) + if med: + transformer.write_ndjson(med) # participant ids from specimen identifiers # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) From bac208473bd5bb8ef42310476af381bf67121425 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 9 Oct 2024 09:28:19 -0700 Subject: [PATCH 18/24] medadmin - med - substance - substancedefinition --- fhirizer/htan2fhir.py | 202 +++++++++++++++++++++++----- fhirizer/utils.py | 17 +-- scripts/gdc_scan.py | 297 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 468 insertions(+), 48 deletions(-) create mode 100644 scripts/gdc_scan.py diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index ae75619..5fe4846 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -39,7 +39,7 @@ from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication, MedicationIngredient from fhir.resources.substance import Substance, SubstanceIngredient -from fhir.resources.substancedefinition import SubstanceDefinition, SubstanceDefinitionStructure +from fhir.resources.substancedefinition import SubstanceDefinition,SubstanceDefinitionStructure, SubstanceDefinitionStructureRepresentation, SubstanceDefinitionName # File data on synapse after authentication @@ -62,6 +62,7 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' self.SYSTEM_SNOME = 'http://snomed.info/sct' self.SYSTEM_LOINC = 'http://loinc.org' + self.SYSTEM_chEMBL = 'https://www.ebi.ac.uk/chembl' self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson @@ -408,6 +409,87 @@ def get_patient_id(self, participant_id) -> str: patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) return patient_id + @staticmethod + def create_substance_definition_representations(df: pd.DataFrame) -> list: + representations = [] + for index, _row in df.iterrows(): + if pd.notna(_row['STANDARD_INCHI']): + representations.append(SubstanceDefinitionStructureRepresentation( + **{"representation": _row['STANDARD_INCHI'], + "format": CodeableConcept(**{"coding": [{"code": "InChI", + "system": 'http://hl7.org/fhir/substance-representation-format', + "display": "InChI"}]})})) + + if 
pd.notna(_row['CANONICAL_SMILES']): + representations.append(SubstanceDefinitionStructureRepresentation( + **{"representation": _row['CANONICAL_SMILES'], + "format": CodeableConcept(**{"coding": [{"code": "SMILES", + "system": 'http://hl7.org/fhir/substance-representation-format', + "display": "SMILES"}]})})) + return representations + + def create_substance_definition(self, compound_name: str, representations: list) -> SubstanceDefinition: + sub_def_identifier = Identifier(**{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) + sub_def_id = self.mint_id(identifier=sub_def_identifier, resource_type="SubstanceDefinition", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + return SubstanceDefinition(**{"id": sub_def_id, + "identifier": [sub_def_identifier], + "structure": SubstanceDefinitionStructure(**{"representation": representations}), + "name": [SubstanceDefinitionName(**{"name": compound_name})] + }) + + def create_substance(self, compound_name:str, substance_definition: SubstanceDefinition) -> Substance: + code = None + if substance_definition: + code = CodeableReference( + **{"concept": CodeableConcept(**{"coding": [{"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), "display": compound_name}]}), + "reference": Reference(**{"reference": f"SubstanceDefinition/{substance_definition.id}"})}) + + sub_identifier = Identifier( + **{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) + sub_id = self.mint_id(identifier=sub_identifier, resource_type="Substance", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + return Substance(**{"id": sub_id, + "identifier": [sub_identifier], + "instance": True, # place-holder + "category": [CodeableConcept(**{"coding": [{"code": "drug", + "system": "http://terminology.hl7.org/CodeSystem/substance-category", + "display": "Drug or Medicament"}]})], + "code": code}) + + def create_medication(self, compound_name: Optional[str], treatment_type: Optional[str], _substance: Optional[Substance]) -> Medication: + code = None + med_identifier = None + if compound_name: + code = CodeableConcept(**{"coding": [ + {"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), + "display": compound_name}]}) + + med_identifier = Identifier( + **{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) + else: + code = CodeableConcept(**{"coding": [ + {"code": treatment_type, "system": "/".join([self.SYSTEM_HTAN, "treatment_type"]), + "display": treatment_type}]}) + + med_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": treatment_type, "use": "official"}) + + med_id = self.mint_id(identifier=med_identifier, resource_type="Medication", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + ingredients = [] + if _substance: + ingredients.append(MedicationIngredient(**{"item": CodeableReference(**{"reference": Reference(**{"reference": f"Substance/{_substance.id}"})})})) + + return Medication(**{"id": med_id, + "identifier": [med_identifier], + "code": code, + "ingredient": ingredients}) def write_ndjson(self, entities): resource_type = entities[0].resource_type @@ -415,6 +497,68 @@ def write_ndjson(self, entities): entities = list({v['id']: v for v in entities}.values()) utils.fhir_ndjson(entities, "".join([self.out_dir, "/", resource_type, ".ndjson"])) + def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.DataFrame: + # create medication placeholder for cases where treatment type 
is defined ex chemo, but medication is not documented + # MedicationAdministration - Medication - Substance - SubstanceDefinition + drugname_fhir_ids = {} + substance_definitions = [] + substances = [] + medications = [] + if not cases["Therapeutic Agents"].isnull().all(): + cases["Therapeutic Agents"] = cases["Therapeutic Agents"].str.upper() + drug_names = list(cases["Therapeutic Agents"][~cases["Therapeutic Agents"].isna()].unique()) + # drug_names = [d.upper() for d in drug_names] + dat = self.get_chembl_compound_info(db_file_path=db_file_path, drug_names=drug_names, limit=1000) + drug_df = pd.DataFrame(dat) + drug_df.columns = ["CHEMBL_ID", "STANDARD_INCHI", "CANONICAL_SMILES", "COMPOUND_NAME"] + + for drug in drug_names: + drug_info = drug_df[drug_df.COMPOUND_NAME.isin([drug])] + drug_info["has_info"] = drug_info[['STANDARD_INCHI', 'CANONICAL_SMILES']].notna().any(axis=1) + if drug_info["has_info"].any(): + drug_representations = self.create_substance_definition_representations(drug_info) + substance_definition = self.create_substance_definition(compound_name=drug, + representations=drug_representations) + + if substance_definition: + substance_definitions.append(substance_definition) + + substance = self.create_substance(compound_name=drug, substance_definition=substance_definition) + + if substance: + substances.append(substance) + medication = self.create_medication(compound_name=drug, _substance=substance, treatment_type=None) + if medication: + medications.append(medication) + drugname_fhir_ids.update({drug: medication.id}) + + else: + medication = self.create_medication(compound_name=drug, _substance=None, treatment_type=None) + medications.append(medication) + drugname_fhir_ids.update({drug: medication.id}) + + if substance_definitions: + transformer.write_ndjson(substance_definitions) + if substances: + transformer.write_ndjson(substances) + + cases['Medication_ID'] = cases['Therapeutic Agents'].map(drugname_fhir_ids, na_action='ignore') + + for index, row in cases.iterrows(): + if pd.isnull(row["Therapeutic Agents"]) and not pd.isnull(row["Treatment Type"]): + medication_agent = self.create_medication(compound_name=None, _substance=None, treatment_type=row["Treatment Type"]) + if medication_agent: + medications.append(medication_agent) + cases.loc[index, 'Medication_ID'] = medication_agent.id + + if row['Therapeutic Agents'] in drugname_fhir_ids.keys(): + cases.loc[index, 'Medication_ID'] = drugname_fhir_ids[row['Therapeutic Agents']] + + if medications: + transformer.write_ndjson(medications) + if 'Medication_ID' in cases.columns: + return cases + class PatientTransformer(HTANTransformer): def __init__(self, *args: Any, **kwargs: Any): @@ -649,7 +793,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) - def create_medication_administration(self, _row: pd.Series, patient_id: str) -> dict: + def create_medication_administration(self, _row: pd.Series, patient_id: str) -> MedicationAdministration: # if Treatment Type exists - make MedicationAdministration # if Days to Treatment End, then status -> completed, else status unknown # if Therapeutic Agents is null, then Medication.code -> snomed_code: Unknown 261665006 @@ -682,19 +826,6 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> if not pd.isnull(_row["Days to Treatment End"]) and not pd.isnull(_row["Days to Treatment Start"]): timing = int(_row["Days to Treatment End"]) - int(_row["Days to Treatment Start"]) - # TODO: replace with chembl - # 
substance_definition = SubstanceDefinition(**{}) - # substance = Substance(**{}) - - medication_identifier = Identifier( - **{"system": self.SYSTEM_HTAN, "use": "official", - "value": medication_code.coding[0].display}) - medication_id = self.mint_id(identifier=medication_identifier, - resource_type="Medication", - project_id=self.project_id, namespace=self.NAMESPACE_HTAN) - - medication = Medication(**{"id": medication_id, "identifier": [medication_identifier], "code": medication_code}) - medication_admin_identifier = Identifier( **{"system": self.SYSTEM_HTAN, "use": "official", "value": "-".join([_row["Atlas Name"], _row["HTAN Participant ID"], _row["Treatment Type"]])}) @@ -706,16 +837,13 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> "status": status, "occurenceDateTime": "2024-10-8T10:30:00.724446-05:00", "category": [CodeableConcept(**{"coding": [{"code": _row["Treatment Type"], - "system": "/".join([self.SYSTEM_HTAN,"Treatment_Type"]) , + "system": "/".join([self.SYSTEM_HTAN, "Treatment_Type"]) , "display": _row["Treatment Type"]}]})], "medication": CodeableReference(**{"concept": medication_code, "reference": Reference( - **{"reference": f"Medication/{medication.id}"})}), + **{"reference": f"Medication/{_row['Medication_ID']}"})}), "subject": Reference(**{"reference": f"Patient/{patient_id}"})} - medication_admin = MedicationAdministration(**data) - return {"medication_admin": medication_admin, - "medication": medication, "substance": substance, - "substance_definition": substance_definition} + return MedicationAdministration(**data) class SpecimenTransformer(HTANTransformer): @@ -873,9 +1001,14 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", - "Vanderbilt"] +# TNP_SARDANA drug name syntax error +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", + "Vanderbilt"] + # atlas_name = ["OHSU"] +db_path = '../../bmeg_backup_0516/bmeg-etl_chembl/source/chembl/chembl_34/chembl_34_sqlite/chembl_34.db' + + for name in atlas_name: print(f"Transforming {name}") @@ -901,7 +1034,10 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu encounters = [] observations = [] med_admins = [] - med = [] + + if not cases["Therapeutic Agents"].isnull().all() or not cases["Treatment Type"].isnull().all(): + cases = transformer.transform_medication(cases, db_file_path=db_path) + for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) @@ -945,22 +1081,20 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu observations.append(condition_observation) if not pd.isnull(row["Treatment Type"]): - med_admin_dict = patient_transformer.create_medication_administration(_row=row, - patient_id=patient.id) - if med_admin_dict["medication_admin"]: - med_admins.append(med_admin_dict["medication_admin"]) + med_admin = patient_transformer.create_medication_administration(_row=row, + patient_id=patient.id) + if med_admin: + med_admins.append(med_admin) med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, official_focus="MedicationAdministration", focus=[Reference(**{ - "reference": f"MedicationAdministration/{med_admin_dict["medication_admin"].id}"})], + 
"reference": f"MedicationAdministration/{med_admin.id}"})], patient_id=patient.id, specimen=None, components=None, category=None, relax=False) if med_admin_observation: observations.append(med_admin_observation) - if med_admin_dict["medication"]: - med.append(med_admin_dict["medication"]) specimens = [] for specimen_index, specimen_row in htan_biospecimens.iterrows(): @@ -1035,9 +1169,11 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu transformer.write_ndjson(document_references) if med_admins: transformer.write_ndjson(med_admins) - if med: - transformer.write_ndjson(med) + # participant ids from specimen identifiers # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) # print(transformer.decipher_htan_id(cases["HTAN Participant ID"][0])) + +# make all possible medications in cases +# point to the medication in MedicationAdministartion - Medication -> Substance -> SubstanceDefination 0...* [representaiton.format representation.string] diff --git a/fhirizer/utils.py b/fhirizer/utils.py index 16513ac..e236b2f 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -1208,30 +1208,17 @@ def get_chembl_compound_info(db_file_path: str, drug_names: list, limit: int) -> drug_names_tuple = tuple([x.upper() for x in drug_names]) query = f""" - SELECT - a.MOLREGNO, - a.PREF_NAME, + SELECT DISTINCT a.CHEMBL_ID, - a.MAX_PHASE, - a.STRUCTURE_TYPE, c.STANDARD_INCHI, - c.STANDARD_INCHI_KEY, c.CANONICAL_SMILES, - d.DOC_ID, - d.PUBMED_ID, - d.DOI, - cr.SRC_ID, - cr.SRC_COMPOUND_ID, - sr.SRC_SHORT_NAME, - sr.SRC_DESCRIPTION + cr.COMPOUND_NAME FROM MOLECULE_DICTIONARY as a LEFT JOIN COMPOUND_STRUCTURES as c ON a.MOLREGNO = c.MOLREGNO LEFT JOIN ACTIVITIES as p ON a.MOLREGNO = p.MOLREGNO - LEFT JOIN - DOCS as d ON p.DOC_ID = d.DOC_ID LEFT JOIN compound_records as cr ON a.MOLREGNO = cr.MOLREGNO LEFT JOIN diff --git a/scripts/gdc_scan.py b/scripts/gdc_scan.py new file mode 100644 index 0000000..9f53ea0 --- /dev/null +++ b/scripts/gdc_scan.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python + +import argparse +import json +import logging +import os +import sys +import shutil +import time +from tqdm import tqdm + +import requests + +URL_BASE = "https://api.gdc.cancer.gov/" +TOKEN = None +client = requests + + +def get_file(file_id, path): + """ download a file from gdc, save in path """ + if os.path.isfile(path): + return path + endpoint = 'data/{}'.format(file_id) + req = client.get(URL_BASE + endpoint) + if not os.path.exists( os.path.dirname(path) ): + os.makedirs(os.path.dirname(path)) + with open(path, 'wb') as out: + out.write(req.content) + return path + + +def query_gdc(endpoint, params): + """ + query_gdc makes a query to the GDC API while handling common issues + like pagination, retries, etc. + + The return value is an iterator. + """ + # Copy input params to avoid modification. + params = dict(params) + page_size = 100 + params['size'] = page_size + + # With a GET request, the filters parameter needs to be converted + # from a dictionary to JSON-formatted string + if 'filters' in params: + params['filters'] = json.dumps(params['filters']) + + headers = None + if TOKEN is not None: + headers = { + "X-Auth-Token" : TOKEN + } + failCount = 0 + # Iterate through all the pages. 
+ with tqdm(total=page_size) as pbar: + while True: + try: + req = client.get(URL_BASE + endpoint, params=params, headers=headers) + data = req.json() + + if 'data' not in data: + print("Bad return %s" % (data)) + failCount += 1 + if failCount >= 10: + raise Exception("Too many failures") + time.sleep(10) + else: + failCount = 0 + data = data['data'] + hits = data.get("hits", []) + if len(hits) == 0: + return + for hit in hits: + yield hit + pbar.total = data['pagination']['total'] + pbar.update( data['pagination']['count'] ) + # Get the next page. + params['from'] = data['pagination']['from'] + page_size + except Exception as e: + if failCount >= 10: + logging.warning(str(e)) + logging.warning(json.dumps(params)) + raise + failCount += 1 + print("Connection Issue %s" % (e)) + time.sleep(10) + +# The GDC API requires you to request that nested fields be expanded. +# https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#cases-field-groups +# +# Note that (as of this writing) we are expanding most but +# not all possible fields. Mostly we're skipping "files" data. +expand_case_fields = ",".join(""" +demographic +diagnoses +diagnoses.treatments +exposures +family_histories +project +project.program +samples +samples.annotations +samples.portions +samples.portions.analytes +samples.portions.analytes.aliquots +samples.portions.analytes.aliquots.annotations +samples.portions.analytes.aliquots.center +samples.portions.analytes.annotations +samples.portions.annotations +samples.portions.center +samples.portions.slides +samples.portions.slides.annotations +summary +summary.data_categories +summary.experimental_strategies +tissue_source_site +type +""".strip().split()) + +# These are the fields we want to keep from the GDC Case (BMEG Case). +keep_case_fields = """ +diagnoses +demographic +disease_type +primary_site +summary +project +""".strip().split() + +expand_project_fields = ",".join(""" +dbgap_accession_number +disease_type +name +primary_site +project_id +released +state +program +summary +""".strip().split()) + + +def scrapeProjects(outfile): + projectOut = open(outfile, "w") + for row in query_gdc("projects", {"expand": expand_project_fields}): + projectOut.write(json.dumps(row)) + projectOut.write("\n") + projectOut.close() + + +def scrapeCases(outfile): + # Crawl all cases, samples, aliquots to generate + # BMEG Cases, Samples, and Aliquots. 
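+    # NOTE: requests the expanded case field groups defined above and writes
+    # one JSON object per line (ndjson) to outfile.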
+ parameters={} + parameters['expand'] = expand_case_fields + case_gids = [] + caseOut = open(outfile, "w") + + for row in query_gdc("cases", parameters): + caseOut.write(json.dumps(row)) + caseOut.write("\n") + + caseOut.close() + +def scrapeCompounds(outdir): + """ the only way to get drugs is to download files and parse them""" + my_filters = json.loads(""" + {"op":"and","content":[{"op":"in","content":{"field":"files.data_type","value":["Clinical data"]}},{"op":"in","content":{"field":"files.tags","value":["drug"]}}]} + """) + + parameters = {'filters' : my_filters} + for row in query_gdc("legacy/files", parameters): + get_file(row['file_id'], '{}/{}.tsv'.format(outdir, row['file_id'])) + +def scrapeFiles(outfile): + parameters={} + parameters['expand'] = ",".join(["cases", "cases.aliquot_ids", "cases.project", "cases.samples.portions.analytes.aliquots", "index_files"]) + + filesOut = open(outfile, "w") + + for row in query_gdc("files", parameters): + filesOut.write(json.dumps(row)) + filesOut.write("\n") + filesOut.close() + +def scrapeExpression(outdir): + parameters = { "filters" : { + "op" : "and", + "content":[{ + "op" : "in", + "content": { + "field" : "data_category", + "value":["Transcriptome Profiling"] + } + },{ + "op" : "in", + "content": { + "field" : "access", + "value":["open"] + } + },{ + "op" : "in", + "content" : { + "field" : "experimental_strategy", + "value":["RNA-Seq"] + } + }] + } } + for row in query_gdc("files", parameters): + outPath = '{}/{}.tsv'.format(outdir, row['file_id']) + if not os.path.exists(outPath): + get_file(row['file_id'], outPath + ".tmp" ) + shutil.move(outPath + ".tmp", outPath) + #print(row) + + +def scrapeOpenMaf(outdir): + parameters = { "filters" : { + "op" : "and", + "content":[{ + "op" : "in", + "content": { + "field" : "data_category", + "value":["Simple Nucleotide Variation"] + } + },{ + "op" : "in", + "content": { + "field" : "access", + "value":["open"] + } + }] + } } + for row in query_gdc("files", parameters): + outPath = '{}/{}.maf.gz'.format(outdir, row['file_id']) + if not os.path.exists(outPath): + get_file(row['file_id'], outPath + ".tmp" ) + shutil.move(outPath + ".tmp", outPath) + #print(row) + +def scrapeControlledMaf(outdir): + parameters = { "filters" : { + "op" : "and", + "content":[{ + "op" : "in", + "content": { + "field" : "data_category", + "value":["Simple Nucleotide Variation"] + } + },{ + "op" : "in", + "content": { + "field" : "access", + "value":["controlled"] + } + }] + } } + for row in query_gdc("files", parameters): + outPath = '{}/{}.maf.gz'.format(outdir, row['file_id']) + if not os.path.exists(outPath): + get_file(row['file_id'], outPath + ".tmp" ) + shutil.move(outPath + ".tmp", outPath) + #print(row) + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("-e", "--endpoint", default=URL_BASE) + parser.add_argument("-t", "--token", default=None) + parser.add_argument("method") + parser.add_argument("dest") + + args = parser.parse_args() + + URL_BASE = args.endpoint + if args.token is not None: + with open(args.token, "rt") as handle: + TOKEN = handle.read().strip() + + if args.method == "projects": + scrapeProjects(args.dest) + if args.method == "cases": + scrapeCases(args.dest) + if args.method == "files": + scrapeFiles(args.dest) + if args.method == "compounds": + scrapeCompounds(args.dest) + if args.method == "expression": + scrapeExpression(args.dest) + if args.method == "open-maf": + scrapeOpenMaf(args.dest) + if args.method == "controlled-maf": + 
scrapeControlledMaf(args.dest) From 05a3c0d5ddf0ea988d56828b252a5ea241ff22f7 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 9 Oct 2024 10:50:11 -0700 Subject: [PATCH 19/24] htan cli --- fhirizer/cli.py | 20 ++- fhirizer/htan2fhir.py | 363 +++++++++++++++++++++--------------------- 2 files changed, 191 insertions(+), 192 deletions(-) diff --git a/fhirizer/cli.py b/fhirizer/cli.py index 69e0189..209a42f 100644 --- a/fhirizer/cli.py +++ b/fhirizer/cli.py @@ -1,5 +1,4 @@ -from fhirizer import utils, mapping, entity2fhir -from fhirizer import icgc2fhir +from fhirizer import utils, mapping, entity2fhir, icgc2fhir, htan2fhir import click from pathlib import Path @@ -141,10 +140,10 @@ def convert(name, in_path, out_path, verbose): show_default=True, help='entity name to map - project, case, file of GDC or cellosaurus') @click.option('--out_dir', cls=NotRequiredIf, - not_required_if='icgc', + not_required_if='htan', help='Directory path to save mapped FHIR ndjson files.') @click.option('--entity_path', cls=NotRequiredIf, - not_required_if='icgc', + not_required_if='htan', help='Path to GDC entity with mapped FHIR like keys (converted file via convert). ' 'or Cellosaurus ndjson file of human cell-lines of interest') @click.option('--icgc', help='Name of the ICGC project to FHIRize.') @@ -153,10 +152,13 @@ def convert(name, in_path, out_path, verbose): @click.option('--convert', is_flag=True, help='Boolean indicating to write converted keys to directory') @click.option('--verbose', is_flag=True) def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): - name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc'] - assert name in ['project', 'case', 'file', 'cellosaurus', 'icgc'], f'--name is not in {name_list}.' - assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path." - assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path." + name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc', 'htan'] + assert name in name_list, f'--name is not in {name_list}.' + if name != 'htan': + assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path." + assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path." 
+ else: + assert Path("./projects/HTAN").is_dir() if name in 'project': entity2fhir.project_gdc_to_fhir_ndjson(out_dir=out_dir, projects_path=entity_path, convert=convert, verbose=verbose) @@ -168,6 +170,8 @@ def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): entity2fhir.cellosaurus2fhir(out_dir=out_dir, path=entity_path) if name in 'icgc' and icgc: icgc2fhir.icgc2fhir(project_name=icgc, has_files=has_files) + if name in 'htan': + htan2fhir.htan2fhir(verbose=verbose) if __name__ == '__main__': diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 5fe4846..edce3a9 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -1,5 +1,6 @@ import uuid import json +import warnings import numpy as np import orjson @@ -306,7 +307,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien observation_fields = [] if official_focus in ["Patient", "Condition"]: - mappings = transformer.cases_mappings() + mappings = self.cases_mappings() code = { "coding": [ { @@ -319,11 +320,11 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien } elif official_focus in ["MedicationAdministration"]: - mappings = transformer.cases_mappings() + mappings = self.cases_mappings() code = self.med_admin_code elif official_focus in ["DocumentReference"]: - mappings = transformer.files_mappings() + mappings = self.files_mappings() code = { "coding": [ { @@ -336,7 +337,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien } elif official_focus in ["Specimen"]: - mappings = transformer.biospecimen_mappings() + mappings = self.biospecimen_mappings() code = { "coding": [ { @@ -538,9 +539,9 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat drugname_fhir_ids.update({drug: medication.id}) if substance_definitions: - transformer.write_ndjson(substance_definitions) + self.write_ndjson(substance_definitions) if substances: - transformer.write_ndjson(substances) + self.write_ndjson(substances) cases['Medication_ID'] = cases['Therapeutic Agents'].map(drugname_fhir_ids, na_action='ignore') @@ -555,7 +556,7 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat cases.loc[index, 'Medication_ID'] = drugname_fhir_ids[row['Therapeutic Agents']] if medications: - transformer.write_ndjson(medications) + self.write_ndjson(medications) if 'Medication_ID' in cases.columns: return cases @@ -610,7 +611,7 @@ def create_patient(self, _row: pd.Series) -> Patient: def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: patient_observation_fields = [] - for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), "Observation.component"): if focus == "Patient": patient_observation_fields.append(field) @@ -1001,179 +1002,173 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -# TNP_SARDANA drug name syntax error -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", - "Vanderbilt"] - -# atlas_name = ["OHSU"] -db_path = '../../bmeg_backup_0516/bmeg-etl_chembl/source/chembl/chembl_34/chembl_34_sqlite/chembl_34.db' - - -for name in atlas_name: - print(f"Transforming {name}") - - transformer = HTANTransformer(subprogram_name=name, 
out_dir=f"./projects/HTAN/{name}/META", verbose=False) - patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", - verbose=False) - specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", - verbose=False) - documentreference_transformer = DocumentReferenceTransformer(subprogram_name=name, - out_dir=f"./projects/HTAN/{name}/META", - verbose=False) - - patient_demographics_df = transformer.patient_demographics - cases = transformer.cases - htan_biospecimens = transformer.biospecimens - files = transformer.files - files_drs_meta = transformer.files_drs_meta - - patients = [] - research_studies = [] - research_subjects = [] - conditions = [] - encounters = [] - observations = [] - med_admins = [] - - if not cases["Therapeutic Agents"].isnull().all() or not cases["Treatment Type"].isnull().all(): - cases = transformer.transform_medication(cases, db_file_path=db_path) - - for index, row in cases.iterrows(): - research_study = patient_transformer.create_researchstudy(_row=row) - - if research_study: - research_studies.append(transformer.program_research_study) - research_studies.append(research_study) - - patient_row = cases.iloc[index][patient_demographics_df.columns] - patient = patient_transformer.create_patient(_row=patient_row) - patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) - if patient_obs: - observations.append(patient_obs) - if patient: - patients.append(patient) - # print(f"HTAN FHIR Patient: {patient.json()}") - # print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") - - research_subject = patient_transformer.create_researchsubject(patient, research_study) - if research_subject: - research_subjects.append(research_subject) - - encounter = patient_transformer.create_encounter(_row=row, patient=patient, condition=None, - procedure=None) - if encounter: - encounters.append(encounter) - condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, - body_structure=None) - - if condition: - conditions.append(condition) - - condition_observation = patient_transformer.create_observation(_row=row, patient=patient, - patient_id=patient.id, - official_focus="Condition", - focus=[Reference(**{ - "reference": f"Condition/{condition.id}"})], - specimen=None, components=None, - category=None, - relax=False) - if condition_observation: - observations.append(condition_observation) - - if not pd.isnull(row["Treatment Type"]): - med_admin = patient_transformer.create_medication_administration(_row=row, - patient_id=patient.id) - if med_admin: - med_admins.append(med_admin) - med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, - official_focus="MedicationAdministration", - focus=[Reference(**{ - "reference": f"MedicationAdministration/{med_admin.id}"})], - patient_id=patient.id, - specimen=None, components=None, - category=None, - relax=False) - if med_admin_observation: - observations.append(med_admin_observation) - - specimens = [] - for specimen_index, specimen_row in htan_biospecimens.iterrows(): - # specimen_row = htan_biospecimens.iloc[specimen_index] - specimen = specimen_transformer.create_specimen(_row=specimen_row) - if specimen: - specimens.append(specimen) - - participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ - "participant_id"] - assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant 
associated with it." - - specimen_participant_id = specimen_transformer.get_patient_id(participant_id=participant_id) - specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, - official_focus="Specimen", - focus=[Reference(**{ - "reference": f"Specimen/{specimen.id}"})], - patient_id=specimen_participant_id, - specimen=specimen, components=None, - category=transformer.lab_category, - relax=False) - if specimen_observation: - observations.append(specimen_observation) - - specimen_ids = [s.id for s in specimens] - patient_ids = [p.id for p in patients] - document_references = [] - for document_reference_index, document_reference_row in files_drs_meta.iterrows(): - docref = documentreference_transformer.create_document_reference(_row=document_reference_row, - specimen_ids=specimen_ids) - if docref: - document_references.append(docref) - - docref_patient_id = None - if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull( - document_reference_row['HTAN Participant ID']): - docref_patient = documentreference_transformer.get_patient_id( - participant_id=document_reference_row['HTAN Participant ID']) - if docref_patient in patient_ids: - docref_patient_id = docref_patient - # else: - # print(f"HTAN {name} is missing patient reference in files") - - document_reference_observation = documentreference_transformer.create_observation( - _row=document_reference_row, patient=None, - official_focus="DocumentReference", - focus=[Reference(**{ - "reference": f"DocumentReference/{docref.id}"})], - patient_id=docref_patient_id, - specimen=None, components=None, - category=transformer.lab_category, - relax=True) - - if document_reference_observation: - observations.append(document_reference_observation) - - if research_subjects: - transformer.write_ndjson(research_subjects) - if research_studies: - transformer.write_ndjson(research_studies) - if patients: - transformer.write_ndjson(patients) - if encounters: - transformer.write_ndjson(encounters) - if conditions: - transformer.write_ndjson(conditions) - if observations: - transformer.write_ndjson(observations) - if specimens: - transformer.write_ndjson(specimens) - if document_references: - transformer.write_ndjson(document_references) - if med_admins: - transformer.write_ndjson(med_admins) - - - # participant ids from specimen identifiers - # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) - # print(transformer.decipher_htan_id(cases["HTAN Participant ID"][0])) - -# make all possible medications in cases -# point to the medication in MedicationAdministartion - Medication -> Substance -> SubstanceDefination 0...* [representaiton.format representation.string] +def htan2fhir(verbose): + warnings.filterwarnings('ignore') + atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", + "Vanderbilt"] + # TNP_SARDANA drug name syntax error + db_path = str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) + + for name in atlas_name: + if verbose: + print(f"Transforming {name}") + + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=verbose) + patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=verbose) + specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=verbose) + documentreference_transformer = 
DocumentReferenceTransformer(subprogram_name=name, + out_dir=f"./projects/HTAN/{name}/META", + verbose=verbose) + + patient_demographics_df = transformer.patient_demographics + cases = transformer.cases + htan_biospecimens = transformer.biospecimens + files = transformer.files + files_drs_meta = transformer.files_drs_meta + + patients = [] + research_studies = [] + research_subjects = [] + conditions = [] + encounters = [] + observations = [] + med_admins = [] + + if not cases["Therapeutic Agents"].isnull().all() or not cases["Treatment Type"].isnull().all(): + cases = transformer.transform_medication(cases, db_file_path=db_path) + + for index, row in cases.iterrows(): + research_study = patient_transformer.create_researchstudy(_row=row) + + if research_study: + research_studies.append(transformer.program_research_study) + research_studies.append(research_study) + + patient_row = cases.iloc[index][patient_demographics_df.columns] + patient = patient_transformer.create_patient(_row=patient_row) + patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) + if patient_obs: + observations.append(patient_obs) + if patient: + patients.append(patient) + # print(f"HTAN FHIR Patient: {patient.json()}") + # print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + + research_subject = patient_transformer.create_researchsubject(patient, research_study) + if research_subject: + research_subjects.append(research_subject) + + encounter = patient_transformer.create_encounter(_row=row, patient=patient, condition=None, + procedure=None) + if encounter: + encounters.append(encounter) + condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, + body_structure=None) + + if condition: + conditions.append(condition) + + condition_observation = patient_transformer.create_observation(_row=row, patient=patient, + patient_id=patient.id, + official_focus="Condition", + focus=[Reference(**{ + "reference": f"Condition/{condition.id}"})], + specimen=None, components=None, + category=None, + relax=False) + if condition_observation: + observations.append(condition_observation) + + if not pd.isnull(row["Treatment Type"]): + med_admin = patient_transformer.create_medication_administration(_row=row, + patient_id=patient.id) + if med_admin: + med_admins.append(med_admin) + med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, + official_focus="MedicationAdministration", + focus=[Reference(**{ + "reference": f"MedicationAdministration/{med_admin.id}"})], + patient_id=patient.id, + specimen=None, components=None, + category=None, + relax=False) + if med_admin_observation: + observations.append(med_admin_observation) + + specimens = [] + for specimen_index, specimen_row in htan_biospecimens.iterrows(): + # specimen_row = htan_biospecimens.iloc[specimen_index] + specimen = specimen_transformer.create_specimen(_row=specimen_row) + if specimen: + specimens.append(specimen) + + participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ + "participant_id"] + assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
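+            # NOTE: the participant id recovered from the biospecimen id is used to
+            # mint the Patient id so the specimen-focused Observation references
+            # the correct Patient.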
+ + specimen_participant_id = specimen_transformer.get_patient_id(participant_id=participant_id) + specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, + official_focus="Specimen", + focus=[Reference(**{ + "reference": f"Specimen/{specimen.id}"})], + patient_id=specimen_participant_id, + specimen=specimen, components=None, + category=transformer.lab_category, + relax=False) + if specimen_observation: + observations.append(specimen_observation) + + specimen_ids = [s.id for s in specimens] + patient_ids = [p.id for p in patients] + document_references = [] + for document_reference_index, document_reference_row in files_drs_meta.iterrows(): + docref = documentreference_transformer.create_document_reference(_row=document_reference_row, + specimen_ids=specimen_ids) + if docref: + document_references.append(docref) + + docref_patient_id = None + if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull( + document_reference_row['HTAN Participant ID']): + docref_patient = documentreference_transformer.get_patient_id( + participant_id=document_reference_row['HTAN Participant ID']) + if docref_patient in patient_ids: + docref_patient_id = docref_patient + # else: + # print(f"HTAN {name} is missing patient reference in files") + + document_reference_observation = documentreference_transformer.create_observation( + _row=document_reference_row, patient=None, + official_focus="DocumentReference", + focus=[Reference(**{ + "reference": f"DocumentReference/{docref.id}"})], + patient_id=docref_patient_id, + specimen=None, components=None, + category=transformer.lab_category, + relax=True) + + if document_reference_observation: + observations.append(document_reference_observation) + + if research_subjects: + transformer.write_ndjson(research_subjects) + if research_studies: + transformer.write_ndjson(research_studies) + if patients: + transformer.write_ndjson(patients) + if encounters: + transformer.write_ndjson(encounters) + if conditions: + transformer.write_ndjson(conditions) + if observations: + transformer.write_ndjson(observations) + if specimens: + transformer.write_ndjson(specimens) + if document_references: + transformer.write_ndjson(document_references) + if med_admins: + transformer.write_ndjson(med_admins) + +# for i in $(ls projects/HTAN); do g3t meta validate projects/HTAN/$i/META; done \ No newline at end of file From 57bbf0d7274b9949797111e715d41fe85ca5e3e8 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 9 Oct 2024 11:11:15 -0700 Subject: [PATCH 20/24] updated readme --- README.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3968812..e5a89e6 100644 --- a/README.md +++ b/README.md @@ -77,8 +77,33 @@ Detailed step-by-step guide on FHIRizing data for a project's study can be found ``` - HTAN +FHIRizing HTAN depends on the: +1. Folder hierarchy with naming conventions as below and existance of raw data pulled from HTAN +``` +fhirizer/ +|-- projects/ +| └── HTAN/ +| └── OHSU/ +| |-- raw/ +| | |-- files/ +| | | |-- table_data.tsv +| | | └── cds_manifest.csv +| | |-- biospecimens/table_data.tsv +| | └── cases/table_data.tsv +| └── META/ +``` +2. 
existance of chembl DB file +``` +fhirizer/ +|-- resources/ + └── chembl_resources/chembl_34.db + +``` + +Example run: + ``` - fhirizer generate --name htan --out_dir ./projects//META --entity_path ./projects// + fhirizer generate --name htan ``` ### Constructing GDC maps cli cmds From 683335deb74dd393a352fb7de6ba3c0d30e3a519 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 10 Oct 2024 06:08:31 -0700 Subject: [PATCH 21/24] pass list to notrequired cli options - check for db file --- fhirizer/cli.py | 19 +++++++++++-------- fhirizer/htan2fhir.py | 1 + 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fhirizer/cli.py b/fhirizer/cli.py index 209a42f..4f0324b 100644 --- a/fhirizer/cli.py +++ b/fhirizer/cli.py @@ -7,21 +7,23 @@ class NotRequiredIf(click.Option): def __init__(self, *args, **kwargs): self.not_required_if = kwargs.pop('not_required_if') assert self.not_required_if, "'not_required_if' parameter required" + if isinstance(self.not_required_if, str): + self.not_required_if = [self.not_required_if] kwargs['help'] = (kwargs.get('help', '') + ' NOTE: This argument is mutually exclusive with %s' % - self.not_required_if + ', '.join(self.not_required_if) ).strip() super(NotRequiredIf, self).__init__(*args, **kwargs) def handle_parse_result(self, ctx, opts, args): we_are_present = self.name in opts - other_present = self.not_required_if in opts + others_present = [opt for opt in self.not_required_if if opt in opts] - if other_present: + if others_present: if we_are_present: raise click.UsageError( "Illegal usage: `%s` is mutually exclusive with `%s`" % ( - self.name, self.not_required_if)) + self.name, ', '.join(others_present))) else: self.prompt = None @@ -29,6 +31,7 @@ def handle_parse_result(self, ctx, opts, args): ctx, opts, args) + @click.group() def cli(): """GDC, Cellosaurus, ICGC to FHIR schema Key and Content Mapping""" @@ -140,12 +143,12 @@ def convert(name, in_path, out_path, verbose): show_default=True, help='entity name to map - project, case, file of GDC or cellosaurus') @click.option('--out_dir', cls=NotRequiredIf, - not_required_if='htan', + not_required_if=['htan', 'icgc'], help='Directory path to save mapped FHIR ndjson files.') @click.option('--entity_path', cls=NotRequiredIf, - not_required_if='htan', - help='Path to GDC entity with mapped FHIR like keys (converted file via convert). ' - 'or Cellosaurus ndjson file of human cell-lines of interest') + not_required_if=['htan', 'icgc'], + help='Path to GDC entity with mapped FHIR like keys (converted file via convert) or Cellosaurus ndjson ' + 'file of human cell-lines of interest.') @click.option('--icgc', help='Name of the ICGC project to FHIRize.') @click.option('--has_files', is_flag=True, help='Boolean indicating file metatda via new argo site is available @ ' 'ICGC/{project}/data directory to FHIRize.') diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index edce3a9..04c4b75 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -1008,6 +1008,7 @@ def htan2fhir(verbose): "Vanderbilt"] # TNP_SARDANA drug name syntax error db_path = str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) + assert Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db').is_file(), f"chEMBL db file chembl_34.db does not exist." 
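+    # NOTE: the bundled ChEMBL sqlite db is required up front; transform_medication()
+    # queries it for InChI/SMILES representations whenever 'Therapeutic Agents' are present.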
for name in atlas_name: if verbose: From f1bebefb97a91eb5a34fac2a64649fc536a8077c Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 10 Oct 2024 08:52:12 -0700 Subject: [PATCH 22/24] htan condition stage --- fhirizer/htan2fhir.py | 142 +++++++++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 31 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 04c4b75..e2e63a5 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -40,7 +40,8 @@ from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication, MedicationIngredient from fhir.resources.substance import Substance, SubstanceIngredient -from fhir.resources.substancedefinition import SubstanceDefinition,SubstanceDefinitionStructure, SubstanceDefinitionStructureRepresentation, SubstanceDefinitionName +from fhir.resources.substancedefinition import SubstanceDefinition, SubstanceDefinitionStructure, \ + SubstanceDefinitionStructureRepresentation, SubstanceDefinitionName # File data on synapse after authentication @@ -80,14 +81,14 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): } ] self.med_admin_code = { - "coding": [ - { + "coding": [ + { "system": "http://loinc.org", "code": "80565-5", "display": "Medication administration record" - } - ], - "text": "Medication administration record" + } + ], + "text": "Medication administration record" } parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, @@ -410,6 +411,7 @@ def get_patient_id(self, participant_id) -> str: patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) return patient_id + @staticmethod def create_substance_definition_representations(df: pd.DataFrame) -> list: representations = [] @@ -431,7 +433,8 @@ def create_substance_definition_representations(df: pd.DataFrame) -> list: def create_substance_definition(self, compound_name: str, representations: list) -> SubstanceDefinition: sub_def_identifier = Identifier(**{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) - sub_def_id = self.mint_id(identifier=sub_def_identifier, resource_type="SubstanceDefinition", project_id=self.project_id, + sub_def_id = self.mint_id(identifier=sub_def_identifier, resource_type="SubstanceDefinition", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) return SubstanceDefinition(**{"id": sub_def_id, @@ -440,18 +443,20 @@ def create_substance_definition(self, compound_name: str, representations: list) "name": [SubstanceDefinitionName(**{"name": compound_name})] }) - def create_substance(self, compound_name:str, substance_definition: SubstanceDefinition) -> Substance: + def create_substance(self, compound_name: str, substance_definition: SubstanceDefinition) -> Substance: code = None if substance_definition: code = CodeableReference( - **{"concept": CodeableConcept(**{"coding": [{"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), "display": compound_name}]}), + **{"concept": CodeableConcept(**{"coding": [ + {"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), + "display": compound_name}]}), "reference": Reference(**{"reference": f"SubstanceDefinition/{substance_definition.id}"})}) sub_identifier = Identifier( **{"system": self.SYSTEM_chEMBL, "value": 
compound_name, "use": "official"}) sub_id = self.mint_id(identifier=sub_identifier, resource_type="Substance", - project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) return Substance(**{"id": sub_id, "identifier": [sub_identifier], @@ -461,7 +466,8 @@ def create_substance(self, compound_name:str, substance_definition: SubstanceDef "display": "Drug or Medicament"}]})], "code": code}) - def create_medication(self, compound_name: Optional[str], treatment_type: Optional[str], _substance: Optional[Substance]) -> Medication: + def create_medication(self, compound_name: Optional[str], treatment_type: Optional[str], + _substance: Optional[Substance]) -> Medication: code = None med_identifier = None if compound_name: @@ -480,12 +486,13 @@ def create_medication(self, compound_name: Optional[str], treatment_type: Option **{"system": self.SYSTEM_HTAN, "value": treatment_type, "use": "official"}) med_id = self.mint_id(identifier=med_identifier, resource_type="Medication", - project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) ingredients = [] if _substance: - ingredients.append(MedicationIngredient(**{"item": CodeableReference(**{"reference": Reference(**{"reference": f"Substance/{_substance.id}"})})})) + ingredients.append(MedicationIngredient(**{ + "item": CodeableReference(**{"reference": Reference(**{"reference": f"Substance/{_substance.id}"})})})) return Medication(**{"id": med_id, "identifier": [med_identifier], @@ -519,7 +526,7 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat if drug_info["has_info"].any(): drug_representations = self.create_substance_definition_representations(drug_info) substance_definition = self.create_substance_definition(compound_name=drug, - representations=drug_representations) + representations=drug_representations) if substance_definition: substance_definitions.append(substance_definition) @@ -528,7 +535,8 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat if substance: substances.append(substance) - medication = self.create_medication(compound_name=drug, _substance=substance, treatment_type=None) + medication = self.create_medication(compound_name=drug, _substance=substance, + treatment_type=None) if medication: medications.append(medication) drugname_fhir_ids.update({drug: medication.id}) @@ -547,7 +555,8 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat for index, row in cases.iterrows(): if pd.isnull(row["Therapeutic Agents"]) and not pd.isnull(row["Treatment Type"]): - medication_agent = self.create_medication(compound_name=None, _substance=None, treatment_type=row["Treatment Type"]) + medication_agent = self.create_medication(compound_name=None, _substance=None, + treatment_type=row["Treatment Type"]) if medication_agent: medications.append(medication_agent) cases.loc[index, 'Medication_ID'] = medication_agent.id @@ -731,7 +740,7 @@ def create_body_structure(self, _row, patient: Patient) -> BodyStructure: }) def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encounter, - body_structure: Optional[BodyStructure]) -> Optional[Condition]: + body_structure: Optional[BodyStructure], stage_observation: Optional[Observation]) -> Optional[Condition]: primary_diagnosis = _row.get("Primary Diagnosis") if pd.isnull(primary_diagnosis): return None @@ -776,6 +785,10 @@ def create_condition(self, _row: pd.Series, 
patient: Patient, encounter: Encount patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, "system": self.SYSTEM_HTAN, "display": patient_body_site}]})] + condition_stage = [] + stages = self.create_stage(_row=_row, stage_observation=None) + if stages: + condition_stage = stages return Condition(**{"id": condition_id, "identifier": [condition_identifier], @@ -791,7 +804,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "bodySite": patient_body_site_cc, # "bodyStructure": patient_body_structure_ref, "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}), - "stage": [], + "stage": condition_stage, }) def create_medication_administration(self, _row: pd.Series, patient_id: str) -> MedicationAdministration: @@ -838,7 +851,7 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> "status": status, "occurenceDateTime": "2024-10-8T10:30:00.724446-05:00", "category": [CodeableConcept(**{"coding": [{"code": _row["Treatment Type"], - "system": "/".join([self.SYSTEM_HTAN, "Treatment_Type"]) , + "system": "/".join([self.SYSTEM_HTAN, "Treatment_Type"]), "display": _row["Treatment Type"]}]})], "medication": CodeableReference(**{"concept": medication_code, "reference": Reference( **{"reference": f"Medication/{_row['Medication_ID']}"})}), @@ -846,6 +859,66 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> return MedicationAdministration(**data) + def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation]) -> list: + + assessment = [] + if stage_observation: + assessment.append(Reference(**{"reference": f"Observation/{stage_observation.id}"})) + + # find fields w Condition.stage.summary mappings + cancer_pathological_staging = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'gdc_resources' / 'content_annotations' / 'diagnosis' / 'cancer_pathological_staging.json'))) + + stage_fields = [] + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), + "Condition.stage.summary"): + if "Tumor Grade" in field or "AJCC Pathologic" in field: + stage_fields.append(field) + + if stage_fields: + _stage_df = _row[stage_fields] + + stages = [] + for stage_field in stage_fields: + if not pd.isnull(_row[stage_field]): + + types = [] + summaries = [] + for stage_info in cancer_pathological_staging: + if _row[stage_field] == stage_info["value"]: + type_system = {"code": stage_info["stage_type_sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["stage_type_sctid_display"]} + + summary_htan_system = {"code": _row[stage_field], + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "display": _row[stage_field]} + + summary_snomed_system = {"code": stage_info["sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["sctid_display"]} + + types.append(type_system) + summaries.append(summary_htan_system) + summaries.append(summary_snomed_system) + if not types: + types.append({"code": "_".join(stage_field.lower().split(" ")), + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "display": "_".join(stage_field.lower().split(" "))}) + + summaries.append({"code": _row[stage_field], + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "display": _row[stage_field]}) + + condition_stage = ConditionStage( + **{"summary": CodeableConcept(**{"coding": summaries}), + "assessment": assessment, + 
"type": CodeableConcept(**{"coding": types})}) + if condition_stage: + stages.append(condition_stage) + + return stages + class SpecimenTransformer(HTANTransformer): def __init__(self, *args: Any, **kwargs: Any): @@ -1002,15 +1075,20 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -def htan2fhir(verbose): +def htan2fhir(verbose, entity_atlas_name): warnings.filterwarnings('ignore') - atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", - "Vanderbilt"] + + atlas_names = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", + "Vanderbilt", "TNP_SARDANA"] + assert entity_atlas_name not in atlas_names, f"Please provide a valid HTAN Atlas name in: {atlas_names}" + # TNP_SARDANA drug name syntax error - db_path = str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) - assert Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db').is_file(), f"chEMBL db file chembl_34.db does not exist." + db_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) + assert Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db').is_file(), f"chEMBL db file chembl_34.db does not exist." - for name in atlas_name: + for name in entity_atlas_name: if verbose: print(f"Transforming {name}") @@ -1066,7 +1144,7 @@ def htan2fhir(verbose): if encounter: encounters.append(encounter) condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, - body_structure=None) + body_structure=None, stage_observation=None) if condition: conditions.append(condition) @@ -1076,7 +1154,8 @@ def htan2fhir(verbose): official_focus="Condition", focus=[Reference(**{ "reference": f"Condition/{condition.id}"})], - specimen=None, components=None, + specimen=None, + components=None, category=None, relax=False) if condition_observation: @@ -1092,7 +1171,8 @@ def htan2fhir(verbose): focus=[Reference(**{ "reference": f"MedicationAdministration/{med_admin.id}"})], patient_id=patient.id, - specimen=None, components=None, + specimen=None, + components=None, category=None, relax=False) if med_admin_observation: @@ -1172,4 +1252,4 @@ def htan2fhir(verbose): if med_admins: transformer.write_ndjson(med_admins) -# for i in $(ls projects/HTAN); do g3t meta validate projects/HTAN/$i/META; done \ No newline at end of file +# for i in $(ls projects/HTAN); do g3t meta validate projects/HTAN/$i/META; done From cc2ebf941e3102b8ec8e8628f8fa922faf2434a0 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 10 Oct 2024 13:08:02 -0700 Subject: [PATCH 23/24] stage observations + check medication code --- README.md | 6 ++ fhirizer/htan2fhir.py | 173 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 160 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index e5a89e6..bfbba2f 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,12 @@ Example run: ``` fhirizer generate --name htan ``` + +G3T validate FHIRized ndjson files: +```commandline +for i in $(ls projects/HTAN); do echo $i && g3t meta validate projects/HTAN/$i/META; done +``` + ### Constructing GDC maps cli cmds initialize initial structure of project, case, or file to add Maps diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py 
index e2e63a5..8dd1167 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -470,7 +470,10 @@ def create_medication(self, compound_name: Optional[str], treatment_type: Option _substance: Optional[Substance]) -> Medication: code = None med_identifier = None + if compound_name: + if ":" in compound_name: + compound_name.replace(":", "_") code = CodeableConcept(**{"coding": [ {"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), "display": compound_name}]}) @@ -478,6 +481,9 @@ def create_medication(self, compound_name: Optional[str], treatment_type: Option med_identifier = Identifier( **{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) else: + if ":" in treatment_type: + treatment_type.replace(":", "_") + code = CodeableConcept(**{"coding": [ {"code": treatment_type, "system": "/".join([self.SYSTEM_HTAN, "treatment_type"]), "display": treatment_type}]}) @@ -516,6 +522,11 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat cases["Therapeutic Agents"] = cases["Therapeutic Agents"].str.upper() drug_names = list(cases["Therapeutic Agents"][~cases["Therapeutic Agents"].isna()].unique()) # drug_names = [d.upper() for d in drug_names] + + for drug_name in drug_names: + if ":" in drug_name: + drug_name.replace(":", "_") + dat = self.get_chembl_compound_info(db_file_path=db_file_path, drug_names=drug_names, limit=1000) drug_df = pd.DataFrame(dat) drug_df.columns = ["CHEMBL_ID", "STANDARD_INCHI", "CANONICAL_SMILES", "COMPOUND_NAME"] @@ -740,10 +751,10 @@ def create_body_structure(self, _row, patient: Patient) -> BodyStructure: }) def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encounter, - body_structure: Optional[BodyStructure], stage_observation: Optional[Observation]) -> Optional[Condition]: + body_structure: Optional[BodyStructure], stage_observation: Optional[Observation]) -> dict: primary_diagnosis = _row.get("Primary Diagnosis") if pd.isnull(primary_diagnosis): - return None + return {} # identifier string = project / patient / primary diagnosis condition_identifier = Identifier(**{"system": self.SYSTEM_HTAN, @@ -785,12 +796,8 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, "system": self.SYSTEM_HTAN, "display": patient_body_site}]})] - condition_stage = [] - stages = self.create_stage(_row=_row, stage_observation=None) - if stages: - condition_stage = stages - return Condition(**{"id": condition_id, + condition = Condition(**{"id": condition_id, "identifier": [condition_identifier], "code": CodeableConcept(**{"coding": [{"code": primary_diagnosis, "system": self.SYSTEM_HTAN, @@ -803,10 +810,20 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "recordedDate": recorded_date, "bodySite": patient_body_site_cc, # "bodyStructure": patient_body_structure_ref, - "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}), - "stage": condition_stage, + "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}) }) + stage_observations_dict = self.create_stage_observation(_row=_row, condition=condition, patient=patient) + + condition_stage = [] + stages = self.create_stage(_row=_row, stage_observations_dict=stage_observations_dict) + if stages: + condition_stage = stages + + condition.stage = condition_stage + + return {"condition": condition, "stage_observations_dict": stage_observations_dict} + def 
create_medication_administration(self, _row: pd.Series, patient_id: str) -> MedicationAdministration: # if Treatment Type exists - make MedicationAdministration # if Days to Treatment End, then status -> completed, else status unknown @@ -859,11 +876,8 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> return MedicationAdministration(**data) - def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation]) -> list: - + def create_stage(self, _row: pd.Series, stage_observations_dict: dict) -> list: assessment = [] - if stage_observation: - assessment.append(Reference(**{"reference": f"Observation/{stage_observation.id}"})) # find fields w Condition.stage.summary mappings cancer_pathological_staging = utils._read_json(str(Path(importlib.resources.files( @@ -873,6 +887,7 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), "Condition.stage.summary"): if "Tumor Grade" in field or "AJCC Pathologic" in field: + # TODO: check for 8th/other edition stage_fields.append(field) if stage_fields: @@ -880,6 +895,10 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] stages = [] for stage_field in stage_fields: + stage_name = "_".join(stage_field.lower().split(" ")) + stage_observation = stage_observations_dict.get(stage_name) + if stage_observation: + assessment = [Reference(**{"reference": f"Observation/{stage_observation.id}"})] if not pd.isnull(_row[stage_field]): types = [] @@ -891,7 +910,8 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] "display": stage_info["stage_type_sctid_display"]} summary_htan_system = {"code": _row[stage_field], - "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "system": "/".join( + [self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), "display": _row[stage_field]} summary_snomed_system = {"code": stage_info["sctid"], @@ -903,7 +923,7 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] summaries.append(summary_snomed_system) if not types: types.append({"code": "_".join(stage_field.lower().split(" ")), - "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), "display": "_".join(stage_field.lower().split(" "))}) summaries.append({"code": _row[stage_field], @@ -919,6 +939,116 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] return stages + def create_stage_observation(self, _row: pd.Series, condition: Condition, patient: Patient) -> dict: + observation_dict = {} + + # find fields w Condition.stage.summary mappings + cancer_pathological_staging = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'gdc_resources' / 'content_annotations' / 'diagnosis' / 'cancer_pathological_staging.json'))) + + ajcc_pathologic_stage_fields = [] + grade_stage_fields = [] + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), + "Condition.stage.summary"): + if "AJCC Pathologic" in field: + # TODO: check for 8th/other edition + ajcc_pathologic_stage_fields.append(field) + elif "Tumor Grade" in field: + grade_stage_fields.append(field) + + _ajcc_pathologic_stage = None + if pd.notna(ajcc_pathologic_stage_fields).all(): + _ajcc_pathologic_stage = _row[ajcc_pathologic_stage_fields] + 
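+        # NOTE: observations for the other 'AJCC Pathologic *' columns are built first
+        # and collected as 'member' references; the overall 'AJCC Pathologic Stage'
+        # observation created below links to them via hasMember.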
+ member = [] + if pd.notna(ajcc_pathologic_stage_fields).all(): + # print(_ajcc_pathologic_stage, type(_ajcc_pathologic_stage)) + for col_name, value in _ajcc_pathologic_stage.items(): + # "these are children stages and are members" + if value and col_name != "AJCC Pathologic Stage": + stage = "_".join(col_name.lower().split(" ")) + identifier_value = "-".join([patient.identifier[0].value, condition.id, stage]) + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": identifier_value}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + code = None + value_code = None + for stage_info in cancer_pathological_staging: + if value == stage_info["value"]: + code = CodeableConcept(**{"coding": [{"code": stage_info["stage_type_sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["stage_type_sctid_display"]}]}) + + value_code = CodeableConcept(**{"coding": [{"code": stage_info["sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["sctid_display"]}]}) + if not code: + code = CodeableConcept(**{"coding": [{"code": stage, + "system": "/".join([self.SYSTEM_HTAN, stage]), + "display": stage}]}) + + value_code = CodeableConcept(**{"coding": [{"code": value, + "system": "/".join( + [self.SYSTEM_HTAN, stage]), + "display": value}]}) + + _stage_observation = Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "code": code, + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "focus": [ + Reference(**{"reference": f"Condition/{condition.id}"})], + "valueCodeableConcept": value_code}) + observation_dict.update({stage: _stage_observation}) + member.append(Reference(**{"reference": f"Observation/{_stage_observation.id}"})) + # print(member) + + if not pd.isnull(_row["AJCC Pathologic Stage"]): + stage = "_".join("AJCC Pathologic Stage".lower().split(" ")) + identifier_value = "-".join([patient.identifier[0].value, condition.id, stage]) + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": identifier_value}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + code = None + value_code = None + for stage_info in cancer_pathological_staging: + if _row["AJCC Pathologic Stage"] == stage_info["value"]: + code = CodeableConcept(**{"coding": [{"code": stage_info["stage_type_sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["stage_type_sctid_display"]}]}) + + value_code = CodeableConcept(**{"coding": [{"code": stage_info["sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["sctid_display"]}]}) + if not code: + code = CodeableConcept(**{"coding": [{"code": stage, + "system": "/".join([self.SYSTEM_HTAN, stage]), + "display": stage}]}) + + value_code = CodeableConcept(**{"coding": [{"code": _row["AJCC Pathologic Stage"], + "system": "/".join( + [self.SYSTEM_HTAN, stage]), + "display": _row["AJCC Pathologic Stage"]}]}) + + _stage_observation = Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "code": code, + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "focus": [Reference(**{"reference": f"Condition/{condition.id}"})], + "valueCodeableConcept": value_code, + "hasMember": member}) + observation_dict.update({stage: _stage_observation}) + + 
return observation_dict + class SpecimenTransformer(HTANTransformer): def __init__(self, *args: Any, **kwargs: Any): @@ -1143,17 +1273,22 @@ def htan2fhir(verbose, entity_atlas_name): procedure=None) if encounter: encounters.append(encounter) - condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, + condition_dict = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, body_structure=None, stage_observation=None) - if condition: - conditions.append(condition) + if condition_dict and condition_dict["condition"]: + conditions.append(condition_dict["condition"]) + + if condition_dict["stage_observations_dict"]: + for key, obs_item in condition_dict["stage_observations_dict"].items(): + if obs_item: + observations.append(obs_item) condition_observation = patient_transformer.create_observation(_row=row, patient=patient, patient_id=patient.id, official_focus="Condition", focus=[Reference(**{ - "reference": f"Condition/{condition.id}"})], + "reference": f"Condition/{condition_dict["condition"].id}"})], specimen=None, components=None, category=None, From ba2650c1f0e9690f07f139cb544668ac769da82a Mon Sep 17 00:00:00 2001 From: teslajoy Date: Fri, 11 Oct 2024 05:47:51 -0700 Subject: [PATCH 24/24] enable one-many atalas names to be passed --- README.md | 5 +++++ fhirizer/cli.py | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bfbba2f..bf26321 100644 --- a/README.md +++ b/README.md @@ -102,9 +102,14 @@ fhirizer/ Example run: +for all available atlases under ./projects/HTAN/ ``` fhirizer generate --name htan ``` +or for one or more: +```commandline +fhirizer generate --name htan --atlas "OHSU,DFCI,WUSTL,BU,CHOP" +``` G3T validate FHIRized ndjson files: ```commandline diff --git a/fhirizer/cli.py b/fhirizer/cli.py index 4f0324b..bdaab62 100644 --- a/fhirizer/cli.py +++ b/fhirizer/cli.py @@ -149,12 +149,16 @@ def convert(name, in_path, out_path, verbose): not_required_if=['htan', 'icgc'], help='Path to GDC entity with mapped FHIR like keys (converted file via convert) or Cellosaurus ndjson ' 'file of human cell-lines of interest.') +@click.option('--atlas', required=False, + default=['OHSU'], + show_default=True, + help='List of atlas project(s) name to FHIRize. ex. ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford"]') @click.option('--icgc', help='Name of the ICGC project to FHIRize.') @click.option('--has_files', is_flag=True, help='Boolean indicating file metatda via new argo site is available @ ' 'ICGC/{project}/data directory to FHIRize.') @click.option('--convert', is_flag=True, help='Boolean indicating to write converted keys to directory') @click.option('--verbose', is_flag=True) -def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): +def generate(name, out_dir, entity_path, icgc, has_files, atlas, convert, verbose): name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc', 'htan'] assert name in name_list, f'--name is not in {name_list}.' 
if name != 'htan': @@ -174,7 +178,14 @@ def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): if name in 'icgc' and icgc: icgc2fhir.icgc2fhir(project_name=icgc, has_files=has_files) if name in 'htan': - htan2fhir.htan2fhir(verbose=verbose) + if isinstance(atlas, str): + if "," in atlas: + atlas = atlas.split(",") + atlas = [a.strip() for a in atlas] + else: + atlas = [atlas] + + htan2fhir.htan2fhir(entity_atlas_name=atlas, verbose=verbose) if __name__ == '__main__':