From b4d8a8205c78c384f91f89b6adf4b35e4fce7692 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 1 Oct 2024 09:50:34 -0700 Subject: [PATCH 01/24] HTAN schema translation to FHIR --- README.md | 23 +- fhirizer/htan2fhir.py | 106 +++ fhirizer/utils.py | 16 + resources/htan_resources/biospecimens.json | 406 +++++++++ resources/htan_resources/cases.json | 937 +++++++++++++++++++++ resources/htan_resources/files.json | 450 ++++++++++ setup.py | 2 +- 7 files changed, 1935 insertions(+), 5 deletions(-) create mode 100644 fhirizer/htan2fhir.py create mode 100644 resources/htan_resources/biospecimens.json create mode 100644 resources/htan_resources/cases.json create mode 100644 resources/htan_resources/files.json diff --git a/README.md b/README.md index 0d83bcb..d4eec77 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ### Project overview: -Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, and International Cancer Genome Consortium (ICGC) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format. +Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, International Cancer Genome Consortium (ICGC), and Human Tumor Atlas Network (HTAN) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format. - #### GDC study simplified FHIR graph ![mapping](./imgs/gdc_tcga_study_example_fhir_graph.png) @@ -75,6 +75,11 @@ Detailed step-by-step guide on FHIRizing data for a project's study can be found ``` fhirizer generate --name icgc --icgc --has_files ``` +- HTAN + + ``` + fhirizer generate --name htan --out_dir ./projects//META --entity_path ./projects// + ``` ### Constructing GDC maps cli cmds initialize initial structure of project, case, or file to add Maps @@ -145,9 +150,19 @@ fhirizer/ | | |-- filess.ndjson | | └── META/ | └── ICGC/ -| └── ICGC-STUDY/ -| |-- data/ -| └── META/ +| | └── ICGC-STUDY/ +| | |-- data/ +| | └── META/ +| └── HTAN/ +| | └── OHSU/ +| | └── Breast_NOS/ +| | |-- raw/ +| | | |-- files/ +| | | |-- biospecimens/ +| | | └── cases/ +| | └── META/ +| | +| | |--README.md └── setup.py ``` diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py new file mode 100644 index 0000000..19ad40a --- /dev/null +++ b/fhirizer/htan2fhir.py @@ -0,0 +1,106 @@ +import uuid +import json +import orjson +import copy +import glob +import pathlib +import inflection +import itertools +import pandas as pd +from fhirizer import utils +from pathlib import Path +import importlib.resources +from uuid import uuid3, NAMESPACE_DNS + +from fhir.resources.reference import Reference +from fhir.resources.identifier import Identifier +from fhir.resources.codeableconcept import CodeableConcept +from fhir.resources.patient import Patient +from fhir.resources.researchstudy import ResearchStudy +from fhir.resources.researchsubject import ResearchSubject +from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection +from fhir.resources.condition import Condition, ConditionStage +from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ + DocumentReferenceContentProfile +from fhir.resources.attachment import Attachment +from fhir.resources.observation import Observation +from fhir.resources.medicationadministration import MedicationAdministration +from fhir.resources.medication import Medication + +# File data on synapse after authentication +# https://github.com/Sage-Bionetworks/synapsePythonClient?tab=readme-ov-file#store-a-file-to-synapse + +project_id = 
"OHSU_Breast_NOS" +project_path = "./projects/HTAN/OHSU/Breast_NOS" + +SYSTEM_HTAN = 'https://data.humantumoratlas.org' +NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SYSTEM_HTAN) +verbose = True + +cases_mapping = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'htan_resources' / 'cases.json'))) + +biospecimens_mapping = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'htan_resources' / 'biospecimens.json'))) + +files_mapping = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'htan_resources' / 'files.json'))) + +# https://jen-dfci.github.io/htan_missing_manual/data_model/overview/ + +# cases_mappings +# https://data.humantumoratlas.org/standard/clinical +# cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter +# 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP +# 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient +cases_path = "".join([project_path, "/raw/cases/table_data.tsv"]) +cases = pd.read_csv(cases_path, sep="\t") + +# identifiers of the cases matrix/df +patient_identifier_field = "HTAN Participant ID" + + +def get_htan_field(match, field_maps, map_info): + for field, mappings in field_maps.items(): + assert isinstance(mappings, list), f"HTAN resource mappings is not a list: {type(mappings)}, {mappings}" + for entry_map in mappings: + if entry_map[map_info] and entry_map[map_info] == match: + yield field + break + + +components_fields = [] +for key in get_htan_field(match='Condition', field_maps=cases_mapping, map_info='focus'): + components_fields.append(key) + if verbose: + print(f"Observation focus -> condition - filed': {key}") + +observation_component_df = cases[[patient_identifier_field] + components_fields] + + +for key in get_htan_field(match='Observation.component', field_maps=cases_mapping, map_info='fhir_map'): + if verbose: + print(f"field name mapped to Observation.component': {key}") + +# _component = utils.get_component(key=field, value=_component_value, component_type=utils.get_data_types(type(_component_value)), system=SYSTEM_HTAN) + +# format for onsetAge +# "onsetAge": { +# "value": 23194, +# "unit": "days", +# "system": "http://unitsofmeasure.org", +# "code": "d" +# } + +# biospecimens_mapping +# biospecimens to Specimen / Observation -> Specimen +# 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. 
# check for participant id for type of reference +# 'Biospecimen Type': #NOTE: Doesn't seem informative +biospecimens_path = "".join([project_path, "/raw/biospecimens/table_data.tsv"]) +biospecimens = pd.read_csv(biospecimens_path, sep="\t") +biospecimen_identifier_field = "HTAN Biospecimen ID" + +# files_mapping +# files to DocumentReference / Attachment / Observation -> DocumentReference +files_metadata = pd.read_csv("".join([project_path, "/raw/files/table_data.tsv"]), sep="\t") +files_drs_uri = pd.read_csv("".join([project_path, "/raw/files/cds_manifest.csv"])) diff --git a/fhirizer/utils.py b/fhirizer/utils.py index fe0cb60..71fcad0 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -1074,6 +1074,22 @@ def ncit2mondo(path): return data +def get_data_types(data_type): + if data_type in ['int64', 'int32', 'int16']: + return 'int' + elif data_type in ['float64', 'float32', 'float16']: + return 'float' + elif data_type in ['string']: + return 'string' + elif data_type == 'bool': + return 'bool' + elif data_type in ['datetime64[ns]', 'timedelta64[ns]', 'period']: + return 'dateTime' + else: + print(f"New or Null Data type: {data_type}.") + return data_type + + def get_component(key, value=None, component_type=None, system="https://cadsr.cancer.gov/sample_laboratory_observation"): if component_type == 'string': value = {"valueString": value} diff --git a/resources/htan_resources/biospecimens.json b/resources/htan_resources/biospecimens.json new file mode 100644 index 0000000..9b2d88e --- /dev/null +++ b/resources/htan_resources/biospecimens.json @@ -0,0 +1,406 @@ +{ + "HTAN Biospecimen ID": [ + { + "fhir_map": "Specimen.identifier", + "use": "official", + "focus": null + } + ], + "Atlas Name": [ + { + "fhir_map": "ResearchStudy.name", + "focus": null + } + ], + "Source HTAN Biospecimen ID": [ + { + "fhir_map": "Specimen.identifier", + "use": "secondary", + "focus": null + } + ], + "HTAN Parent ID": [ + { + "fhir_map": "Specimen.parent", + "focus": null + } + ], + "Timepoint Label": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Collection Days from Index": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Adjacent Biospecimen IDs": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Type": [ + { + "fhir_map": "Specimen.type", + "focus": null + } + ], + "Acquisition Method Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Fixative Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Storage Method": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Processing Days from Index": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Protocol Link": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Site Data Source": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Collection Media": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Mounting Medium": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Processing Location": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Histology Assessment By": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Histology Assessment Medium": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Preinvasive Morphology": [ + { + "fhir_map": 
"Observation.component", + "focus": "Specimen" + } + ], + "Tumor Infiltrating Lymphocytes": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Degree of Dysplasia": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Dysplasia Fraction": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Number Proliferating Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Eosinophil Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Granulocyte Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Inflam Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Lymphocyte Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Monocyte Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Necrosis": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Neutrophil Infiltration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Normal Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Stromal Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Tumor Cells": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Percent Tumor Nuclei": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Fiducial Marker": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Slicing Method": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Lysis Buffer": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Method of Nucleic Acid Isolation": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Acquisition Method Other Specify": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Analyte Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Analyte Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Dimension 1": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Dimension 2": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Biospecimen Dimension 3": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Blood Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Bone Marrow Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Dimensions Unit": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Fixation Duration": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "HTAN Parent Biospecimen ID": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Histologic Morphology Code": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Ischemic Temperature": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Ischemic Time": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Other Acquisition Method": [ + { + 
"fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Portion Weight": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Preservation Method": [ + { + "fhir_map": "Specimen.processing.method", + "focus": null + } + ], + "Section Number in Sequence": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Section Thickness Value": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Sectioning Days from Index": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Shipping Condition Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Slide Charge Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Specimen Laterality": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Tissue Biospecimen Type": [ + { + "fhir_map": "Specimen.type", + "focus": null + } + ], + "Total Volume": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Total Volume Unit": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Tumor Tissue Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ], + "Urine Biospecimen Type": [ + { + "fhir_map": "Observation.component", + "focus": "Specimen" + } + ] +} \ No newline at end of file diff --git a/resources/htan_resources/cases.json b/resources/htan_resources/cases.json new file mode 100644 index 0000000..e514419 --- /dev/null +++ b/resources/htan_resources/cases.json @@ -0,0 +1,937 @@ +{ + "HTAN Participant ID": [ + { + "fhir_map": "Patient.identifier", + "use": "official", + "focus": null + } + ], + "Atlas Name": [ + { + "fhir_map": "ResearchStudy.name", + "focus": null + } + ], + "Age at Diagnosis (years)": [ + { + "fhir_map": "Condition.onsetAge", + "focus": null + } + ], + "Year of Diagnosis": [ + { + "fhir_map": "Condition.recordedDate", + "focus": null + }, + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Primary Diagnosis": [ + { + "fhir_map": "Condition.code", + "focus": null + } + ], + "Precancerous Condition Type": [ + { + "fhir_map": "Condition.code", + "focus": null + } + ], + "Site of Resection or Biopsy": [ + { + "fhir_map": "Procedure.bodySite", + "focus": null + } + ], + "Tissue or Organ of Origin": [ + { + "fhir_map": "Condition.bodyStructure", + "focus": null + }, + { + "fhir_map": "Condition.bodySite", + "focus": null + } + ], + "Morphology": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + } + ], + "Progression or Recurrence": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Last Known Disease Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Last Follow up": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Last Known Disease Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Method of Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Prior Malignancy": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Prior Treatment": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Metastasis at Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + 
"Metastasis at Diagnosis Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "First Symptom Prior to Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Percent Tumor Invasion": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Residual Disease": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Synchronous Malignancy": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Confined to Organ of Origin": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Focality": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Largest Dimension Diameter": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Gross Tumor Weight": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Breslow Thickness": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Vascular Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Vascular Invasion Type": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Anaplasia Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Anaplasia Present Type": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Laterality": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Perineural Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Lymphatic Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Lymph Nodes Positive": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Lymph Nodes Tested": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Peritoneal Fluid Cytological Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Classification of Tumor": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Best Overall Response": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Mitotic Count": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "AJCC Clinical M": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Clinical N": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Clinical Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Clinical T": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Pathologic M": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Pathologic N": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + 
"focus": "Condition" + } + ], + "AJCC Pathologic Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Pathologic T": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "AJCC Staging System Edition": [ + { + "fhir_map": "Condition.stage.type", + "focus": null + }, + { + "fhir_map": "Observation.code", + "focus": "Condition" + } + ], + "Cog Neuroblastoma Risk Group": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Cog Rhabdomyosarcoma Risk Group": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Gleason Grade Group": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Gleason Grade Tertiary": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Gleason Patterns Percent": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Greatest Tumor Dimension": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "IGCCCG Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INPC Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INPC Histologic Group": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INRG Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "INSS Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "International Prognostic Index": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "IRS Group": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "IRS Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "ISS Stage": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Lymph Node Involved Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Margin Distance": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Margins Involved Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Medulloblastoma Molecular Classification": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Micropapillary Features": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Mitosis Karyorrhexis Index": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" 
+ } + ], + "Non Nodal Regional Disease": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Non Nodal Tumor Deposits": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Ovarian Specimen Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Ovarian Surface Involvement": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Pregnant at Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Primary Gleason Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Secondary Gleason Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Supratentorial Localization": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Depth": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "WHO CNS Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "WHO NTE Grade": [ + { + "fhir_map": "Condition.stage.summary", + "focus": null + }, + { + "fhir_map": "Observation.valueCodeableConcept", + "focus": "Condition" + } + ], + "Additional Topography": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Progression": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Days to Progression Free": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Extent of Tumor Resection": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Mode of Cancer Detection": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "NCI Atlas Cancer Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + }, + { + "fhir_map": "Organization", + "focus": null + } + ], + "Other Biopsy Resection Site": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + }, + { + "fhir_map": "Organization", + "focus": null + } + ], + "Progression or Recurrence Type": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Satellite Metastasis Present Indicator": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Sentinel Lymph Node Count": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Sentinel Node Positive Assessment Count": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Topography Code": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Tumor Extranodal Extension Indicator": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Yes - Anaplasia Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Yes - Progression or Recurrence": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Yes - Vascular Invasion Present": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Ethnicity": [ + { + "fhir_map": "Patient.extension.valueString", + "focus": null + } + ], + "Gender": [ + { + "fhir_map": "Patient.extension.valueCode", + "focus": null + } + ], + "Race": [ + { + "fhir_map": 
"Patient.extension.valueString", + "focus": null + } + ], + "Vital Status": [ + { + "fhir_map": "Patient.deceasedBoolean", + "focus": null + } + ], + "Days to Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Country of Residence": [ + { + "fhir_map": "Patient.address.country", + "focus": null + } + ], + "Age Is Obfuscated": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Year Of Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Occupation Duration Years": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Premature At Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Weeks Gestation at Birth": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Cause of Death": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Cause of Death Source": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Days to Death": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Dead": [ + { + "fhir_map": "Patient.deceasedBoolean", + "focus": null + } + ], + "Year of Death": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Treatment or Therapy": [ + { + "fhir_map": "Observation.component", + "focus": "Patient" + } + ], + "Treatment Type": [ + { + "fhir_map": "MedicationAdministration.category", + "focus": null + } + ], + "Treatment Effect": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Outcome": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Days to Treatment End": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Anatomic Site": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Days to Treatment Start": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Initial Disease Status": [ + { + "fhir_map": "Observation.component", + "focus": "Condition" + } + ], + "Regimen or Line of Therapy": [ + { + "fhir_map": "MedicationAdministration.category", + "focus": null + } + ], + "Therapeutic Agents": [ + { + "fhir_map": "MedicationAdministration.medication", + "focus": null + } + ], + "Treatment Intent Type": [ + { + "fhir_map": "MedicationAdministration.category", + "focus": null + } + ], + "Chemo Concurrent to Radiation": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Number of Cycles": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Reason Treatment Ended": [ + { + "fhir_map": "MedicationAdministration.statusReason", + "focus": null + } + ], + "Treatment Arm": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Dose": [ + { + "fhir_map": "MedicationAdministration.dosage", + "focus": null + } + ], + "Treatment Dose Units": [ + { + "fhir_map": "MedicationAdministration.dosage", + "focus": null + } + ], + "Treatment Effect Indicator": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Treatment Frequency": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Concomitant Medication Received Type": [ + { + "fhir_map": "Observation.component", + "focus": 
"MedicationAdministration" + } + ], + "Immunosuppression": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ], + "Prior Sites of Radiation": [ + { + "fhir_map": "Observation.component", + "focus": "MedicationAdministration" + } + ] +} diff --git a/resources/htan_resources/files.json b/resources/htan_resources/files.json new file mode 100644 index 0000000..8e31dec --- /dev/null +++ b/resources/htan_resources/files.json @@ -0,0 +1,450 @@ +{ + "Filename": [ + { + "fhir_map": "DocumentReference.name", + "focus": null + }, + { + "fhir_map": "DocumentReference.content.attachment.title", + "focus": null + } + ], + "Atlas Name": [ + { + "fhir_map": "ResearchStudy.name", + "focus": null + } + ], + "Biospecimen": [ + { + "fhir_map": "DocumentReference.basedOn", + "focus": null + } + ], + "Assay": [ + { + "fhir_map": "DocumentReference.category", + "focus": null + } + ], + "Level": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Organ": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Treatment": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Diagnosis": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Data Access": [ + { + "fhir_map": "DocumentReference.securityLabel", + "focus": "DocumentReference" + } + ], + "File Format": [ + { + "fhir_map": "DocumentReference.content.contentType", + "focus": "DocumentReference" + } + ], + "HTAN Participant ID": [ + { + "fhir_map": "Patient.identifier", + "use": "official", + "focus": null + } + ], + "HTAN Parent Biospecimen ID": [ + { + "fhir_map": "Specimen.parent", + "focus": null + } + ], + "HTAN Data File ID": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "official", + "focus": null + } + ], + "Channel Metadata Filename": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Imaging Assay Type": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Protocol Link": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Softwareand Version": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Microscope": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Objective": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Nominal Magnification": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Lens NA": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Working Distance": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Working Distance Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Immersion": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Pyramid": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Zstack": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Tseries": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Passed QC": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Comment": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FO Vnumber": [ + { + 
"fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVX": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVX Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVY": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "FOVY Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Frame Averaging": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Image ID": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Dimension Order": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size X": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size X Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Y": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Y Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Z": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Physical Size Z Unit": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Pixels Big Endian": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Plane Count": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size C": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size T": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size X": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size Y": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Size Z": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Pixel Type": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "MERFISH Positions File": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "MERFISH Codebook File": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Synapse Id": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "secondary", + "focus": null + } + ], + "Atlasid": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Data File ID": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "official", + "focus": null + } + ], + "Participant ID": [ + { + "fhir_map": "Patient.identifier", + "use": "official", + "focus": null + } + ], + "Parent Biospecimen ID": [ + { + "fhir_map": "Specimen.parent", + "focus": null + } + ], + "Publication Ids": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Is Raw Sequencing": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Release Version": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "HTAN Parent Data File ID": [ + { + "fhir_map": "DocumentReference.identifier", + "use": "secondary", + "focus": null + } + ], + "Imaging Segmentation Data Type": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + 
"Parameterfile": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Commit SHA": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Imaging Object Class": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Numberof Objects": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Parent Data File ID": [ + { + "fhir_map": "DocumentReference.relatesTo.target", + "focus": null + } + ], + "HTAN Parent Channel Metadata ID": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Numberof Features": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Imaging Summary Statistic": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "Metadata": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ], + "View": [ + { + "fhir_map": "Observation.component", + "focus": "DocumentReference" + } + ] +} \ No newline at end of file diff --git a/setup.py b/setup.py index 9bf1a93..9190e4c 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -__version__ = '2.0.3' +__version__ = '2.1.3' setup( name='fhirizer', From c12dabf6e2ff3e26377610a415261f0efb8b1cb3 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 1 Oct 2024 09:53:02 -0700 Subject: [PATCH 02/24] updated readme --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d4eec77..b6882a1 100644 --- a/README.md +++ b/README.md @@ -154,15 +154,15 @@ fhirizer/ | | |-- data/ | | └── META/ | └── HTAN/ -| | └── OHSU/ -| | └── Breast_NOS/ -| | |-- raw/ -| | | |-- files/ -| | | |-- biospecimens/ -| | | └── cases/ -| | └── META/ -| | -| | +| └── OHSU/ +| └── Breast_NOS/ +| |-- raw/ +| | |-- files/ +| | |-- biospecimens/ +| | └── cases/ +| └── META/ +| +| |--README.md └── setup.py ``` From ba5e80cac1931638253eeb69bfc24cd9659eb7dd Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 08:28:26 -0700 Subject: [PATCH 03/24] initial htan class and patient transformer --- fhirizer/htan2fhir.py | 281 +++++++++++++++++++++++++++++++----------- fhirizer/utils.py | 8 +- 2 files changed, 212 insertions(+), 77 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 19ad40a..e1a376a 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -11,11 +11,13 @@ from pathlib import Path import importlib.resources from uuid import uuid3, NAMESPACE_DNS +from typing import Any from fhir.resources.reference import Reference from fhir.resources.identifier import Identifier from fhir.resources.codeableconcept import CodeableConcept from fhir.resources.patient import Patient +from fhir.resources.address import Address from fhir.resources.researchstudy import ResearchStudy from fhir.resources.researchsubject import ResearchSubject from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection @@ -30,77 +32,210 @@ # File data on synapse after authentication # https://github.com/Sage-Bionetworks/synapsePythonClient?tab=readme-ov-file#store-a-file-to-synapse -project_id = "OHSU_Breast_NOS" -project_path = "./projects/HTAN/OHSU/Breast_NOS" -SYSTEM_HTAN = 'https://data.humantumoratlas.org' -NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SYSTEM_HTAN) -verbose = True - -cases_mapping = utils._read_json(str(Path(importlib.resources.files( - 'fhirizer').parent / 
'resources' / 'htan_resources' / 'cases.json'))) - -biospecimens_mapping = utils._read_json(str(Path(importlib.resources.files( - 'fhirizer').parent / 'resources' / 'htan_resources' / 'biospecimens.json'))) - -files_mapping = utils._read_json(str(Path(importlib.resources.files( - 'fhirizer').parent / 'resources' / 'htan_resources' / 'files.json'))) - -# https://jen-dfci.github.io/htan_missing_manual/data_model/overview/ - -# cases_mappings -# https://data.humantumoratlas.org/standard/clinical -# cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter -# 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP -# 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient -cases_path = "".join([project_path, "/raw/cases/table_data.tsv"]) -cases = pd.read_csv(cases_path, sep="\t") - -# identifiers of the cases matrix/df -patient_identifier_field = "HTAN Participant ID" - - -def get_htan_field(match, field_maps, map_info): - for field, mappings in field_maps.items(): - assert isinstance(mappings, list), f"HTAN resource mappings is not a list: {type(mappings)}, {mappings}" - for entry_map in mappings: - if entry_map[map_info] and entry_map[map_info] == match: - yield field - break - - -components_fields = [] -for key in get_htan_field(match='Condition', field_maps=cases_mapping, map_info='focus'): - components_fields.append(key) - if verbose: - print(f"Observation focus -> condition - filed': {key}") - -observation_component_df = cases[[patient_identifier_field] + components_fields] - - -for key in get_htan_field(match='Observation.component', field_maps=cases_mapping, map_info='fhir_map'): - if verbose: - print(f"field name mapped to Observation.component': {key}") - -# _component = utils.get_component(key=field, value=_component_value, component_type=utils.get_data_types(type(_component_value)), system=SYSTEM_HTAN) - -# format for onsetAge -# "onsetAge": { -# "value": 23194, -# "unit": "days", -# "system": "http://unitsofmeasure.org", -# "code": "d" -# } - -# biospecimens_mapping -# biospecimens to Specimen / Observation -> Specimen -# 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. 
# check for participant id for type of reference -# 'Biospecimen Type': #NOTE: Doesn't seem informative -biospecimens_path = "".join([project_path, "/raw/biospecimens/table_data.tsv"]) -biospecimens = pd.read_csv(biospecimens_path, sep="\t") -biospecimen_identifier_field = "HTAN Biospecimen ID" - -# files_mapping -# files to DocumentReference / Attachment / Observation -> DocumentReference -files_metadata = pd.read_csv("".join([project_path, "/raw/files/table_data.tsv"]), sep="\t") -files_drs_uri = pd.read_csv("".join([project_path, "/raw/files/cds_manifest.csv"])) +class HTANTransformer: + def __init__(self, subprogram_name: str, project_id: str, verbose: bool): + self.mint_id = utils.mint_id + self._mint_id = utils._mint_id + self.get_data_type = utils.get_data_types + self.get_component = utils.get_component + self.fhir_ndjson = utils.fhir_ndjson + self.subprogram_name = subprogram_name + self.project_id = project_id + self.verbose = verbose + self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' + self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) + self.project_id = project_id + self.read_json = utils._read_json + + self.project_path = str( + Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name / project_id)) + assert Path(self.project_path).is_dir(), f"Path {self.project_path} is not a valid directory path." + + self.cases_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'htan_resources' / 'cases.json')) + assert Path(self.cases_path).is_file(), f"Path {self.cases_path} does not exist." + + self.biospecimens_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'htan_resources' / 'biospecimens.json')) + assert Path(self.biospecimens_path).is_file(), f"Path {self.biospecimens_path} does not exist." + + self.files_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'htan_resources' / 'files.json')) + assert Path(self.files_path).is_file(), f"Path {self.files_path} does not exist." + + self.cases_mappings = self.get_cases_mappings + + # cases_mappings + # https://data.humantumoratlas.org/standard/clinical + # cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter + # 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP + # 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient + self.cases_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/cases/table_data.tsv") + assert self.cases_table_data_path.is_file(), f"Path {self.cases_table_data_path} is not a valid file path." + self.cases = self.get_dataframe(self.cases_table_data_path, sep="\t") + self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df + + self.biospecimen_mappings = self.get_biospecimen_mappings + + # biospecimens_mapping + # biospecimens to Specimen / Observation -> Specimen + # 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. # check for participant id for type of reference + # 'Biospecimen Type': #NOTE: Doesn't seem informative + self.biospecimens_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath( + "./raw/biospecimens/table_data.tsv") + assert self.biospecimens_table_data_path.is_file(), f"Path {self.biospecimens_table_data_path} is not a valid file path." 
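        # Illustrative sketch (same query pattern as the docstring examples further
        # down in this class, not executed here): the biospecimen mapping can be
        # interrogated like the cases mapping, e.g. to list every biospecimen field
        # that should become an Observation.component with a Specimen focus:
        #
        #   for _field, _map, _use, _focus in self.get_fields_by_fhir_map(
        #           self.get_biospecimen_mappings(), "Observation.component"):
        #       if _focus == "Specimen":
        #           print(_field)
        #
        # Fields such as "Preservation Method" and "Tissue Biospecimen Type" map
        # directly to Specimen.processing.method / Specimen.type instead
        # (see resources/htan_resources/biospecimens.json).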
+ self.biospecimens = self.get_dataframe(self.biospecimens_table_data_path, sep="\t") + self.biospecimen_identifier_field = "HTAN Biospecimen ID" + + self.files_mappings = self.get_files_mappings + + # files_mapping + # files to DocumentReference / Attachment / Observation -> DocumentReference + + self.files_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/table_data.tsv") + self.files_drs_uri_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/cds_manifest.csv") + assert self.files_table_data_path.is_file(), f"Path {self.files_table_data_path} is not a valid file path." + assert self.files_drs_uri_path.is_file(), f"Path {self.files_drs_uri_path} is not a valid file path." + + self.files = self.get_dataframe(self.files_table_data_path, sep="\t") + self.files_drs_uri = pd.read_csv(self.files_drs_uri_path, sep=",") + + self.patient_demographics = self.get_patient_demographics() + + def get_cases_mappings(self) -> dict: + """HTAN cases FHIR mapping""" + return self.read_json(self.cases_path) + + def get_biospecimen_mappings(self) -> dict: + """HTAN biospesimens FHIR mapping""" + return self.read_json(self.biospecimens_path) + + def get_files_mappings(self) -> dict: + """HTAN files FHIR mapping""" + return self.read_json(self.files_path) + + @staticmethod + def get_dataframe(_path, sep) -> pd.DataFrame: + """Returns a Pandas DataFrame with lower-case and inflection.underscore columns for standard UI input""" + _data = pd.read_csv(_path, sep=sep) + # _data.columns = _data.columns.to_series().apply(lambda x: inflection.underscore(inflection.parameterize(x))) + return _data + + def get_patient_demographics(self) -> pd.DataFrame: + """HTAN cases table_data.tsv data with Patient FHIR demographics mappings column/field match""" + field_list = [] + for field in self.get_htan_mapping(match='Patient', field_maps=self.cases_mappings(), map_info='fhir_map', fetch='field'): + field_list.append(field) + if self.verbose: + print(f"field name': {field}") + + patient_demographics = self.cases[field_list] + return patient_demographics + + @staticmethod + def get_htan_mapping(match, field_maps, map_info, fetch): + """Yields FHIR HTAN maps from HTAN field or FHIR mapping string""" + for field, mappings in field_maps.items(): + assert isinstance(mappings, list), f"HTAN resource mappings is not a list: {type(mappings)}, {mappings}" + for entry_map in mappings: + if entry_map[map_info] and match in entry_map[map_info]: + if fetch == "field": + yield field + break + elif fetch == "mapping": + yield entry_map + break + + @staticmethod + def get_fields_by_fhir_map(mapping_data, fhir_mapping=None): + """ + Yields the field(s) associated with a specific HTAN FHIR map or all HTAN FHIR maps + + Return: Yields the field, FHIR map, identifier use, and focus. 
+ example: + for field, fhir_map, use, focus in get_fields_by_fhir_map(cases_mapping, "Observation.component"): + print(f"Field: {field}, FHIR Map: {fhir_map}, Identifier use: {use}, Focus: {focus}") + """ + for _field, mappings in mapping_data.items(): + for mapping in mappings: + _current_fhir_map = mapping["fhir_map"] + _focus = mapping.get("focus", None) + _use = mapping.get("use", None) + + if fhir_mapping is None or _current_fhir_map == fhir_mapping: + yield _field, _current_fhir_map, _use, _focus + + @staticmethod + def get_fhir_maps_by_field(mapping_data, field_name=None): + """ + Yields the FHIR map(s) associated with a specific HTAN field or all HTAN FHIR maps + + Return: Yields the field, FHIR map, identifier use, and focus. + example use: + for field, fhir_map, use, focus in get_fhir_maps_by_field(cases_mapping, "Year of Diagnosis"): + print(f"Field: {field}, FHIR Map: {fhir_map}, Identifier use: {use}, Focus: {focus}") + """ + for _field, mappings in mapping_data.items(): + if field_name is None or _field == field_name: + for mapping in mappings: + _fhir_map = mapping["fhir_map"] + _focus = mapping.get("focus", None) + _use = mapping.get("use", None) + yield _field, _fhir_map, _use, _focus + + +class PatientTransformer(HTANTransformer): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(**kwargs) + self.cases_mapping = self.cases_mappings + self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + + def create_patient(self, _row: pd.Series) -> Patient: + """Transform HTAN case demographics to FHIR Patient""" + use = None + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), "Patient.identifier"): + use = _use + assert use, f"Patient.identifier use is not defined in ./resources/HTAN/cases.json mappings." + + patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": _row['HTAN Participant ID'], "use": use}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + deceasedBoolean_fields = [] + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), "Patient.deceasedBoolean"): + deceasedBoolean_fields.append(_field) + assert deceasedBoolean_fields, f"Patient.deceasedBoolean has no fields defined in ./resources/HTAN/cases.json mappings." 
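        # Intent of the lookup below: collapse the deceased-related columns
        # ("Vital Status", "Dead") into FHIR Patient.deceasedBoolean. A reported
        # "Dead" value becomes True, any other reported status becomes False, and a
        # row with nothing reported stays None. Note this relies on .any() over the
        # object-dtype array returned by .unique(), which or-reduces to a truthy
        # element (e.g. the string "Dead") rather than a strict bool.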
+ + + vital_status = _row[deceasedBoolean_fields].dropna().unique().any() + deceasedBoolean = {"Dead": True}.get(vital_status, False if vital_status else None) + + # TODO: us-core-ethnicity and race resource + ethnicity = _row.get("Ethnicity") + race = _row.get("Race") + + address_country = _row.get("Country of Residence") + address = Address(**{"country": address_country}) + + return Patient(**{"id": patient_id, + "identifier": [patient_identifier], + "deceasedBoolean": deceasedBoolean, + "extension": [{"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "valueString": ethnicity}, + {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", + "valueString": race} + ], + "address": [address]}) + + +transformer = HTANTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) +patient_transformer = PatientTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) +patient_demographics_df = transformer.patient_demographics + +patients = [] +for index, row in patient_demographics_df.iterrows(): + patient = patient_transformer.create_patient(_row=row) + if patient: + patients.append(orjson.loads(patient.json())) + print(f"HTAN FHIR Patient: {patient.json()}") \ No newline at end of file diff --git a/fhirizer/utils.py b/fhirizer/utils.py index 71fcad0..92f1744 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -1075,15 +1075,15 @@ def ncit2mondo(path): def get_data_types(data_type): - if data_type in ['int64', 'int32', 'int16']: + if data_type in ['int64', 'int32', 'int16', 'int']: return 'int' - elif data_type in ['float64', 'float32', 'float16']: + elif data_type in ['float64', 'float32', 'float16', 'float']: return 'float' - elif data_type in ['string']: + elif data_type in ['str', 'string']: return 'string' elif data_type == 'bool': return 'bool' - elif data_type in ['datetime64[ns]', 'timedelta64[ns]', 'period']: + elif data_type in ['datetime64[ns]', 'timedelta64[ns]', 'period', 'datetime', 'date']: return 'dateTime' else: print(f"New or Null Data type: {data_type}.") From ccf849e924d89ba11b4aefc5719091188c3b7b64 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 09:37:31 -0700 Subject: [PATCH 04/24] patient observation --- fhirizer/htan2fhir.py | 63 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index e1a376a..d209512 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -190,6 +190,9 @@ def __init__(self, *args: Any, **kwargs: Any): super().__init__(**kwargs) self.cases_mapping = self.cases_mappings self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + self.get_data_types = utils.get_data_types + self.get_component = self.get_component + self.get_fields_by_fhir_map = self.get_fields_by_fhir_map def create_patient(self, _row: pd.Series) -> Patient: """Transform HTAN case demographics to FHIR Patient""" @@ -228,14 +231,68 @@ def create_patient(self, _row: pd.Series) -> Patient: ], "address": [address]}) + def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: + patient_observation_fields = [] + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), "Observation.component"): + if focus == "Patient": + patient_observation_fields.append(field) + + if patient_observation_fields: + _obervation_row = _row[patient_observation_fields] + + components = [] + for key, value in _obervation_row.to_dict().items(): + if key != 'HTAN Participant 
ID': + if isinstance(value, float) and not pd.isna(value) and ("Year" in key or "Day" in key or "year" in key or "day" in key): + value = int(value) + _component = self.get_component(key=key, value=value, component_type=self.get_data_types(type(value).__name__), system=self.SYSTEM_HTAN) + components.append(_component) + + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.id}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + return Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "category": [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "exam", + "display": "exam" + } + ], + "text": "Exam" + } + ], + "code": { + "coding": [ + { + "system": "http://loinc.org", + "code": "52460-3", # TODO: may need to change to be more specific + "display": "patient information" + } + ], + "text": "Patient Information" + }, + "focus": [Reference(**{"reference": f"Patient/{patient.id}"})], + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "component": components}) + transformer = HTANTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) patient_transformer = PatientTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) patient_demographics_df = transformer.patient_demographics +cases = transformer.cases patients = [] -for index, row in patient_demographics_df.iterrows(): - patient = patient_transformer.create_patient(_row=row) +for index, row in cases.iterrows(): + patient_row = cases.iloc[index][patient_demographics_df.columns] + patient = patient_transformer.create_patient(_row=patient_row) + patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) if patient: patients.append(orjson.loads(patient.json())) - print(f"HTAN FHIR Patient: {patient.json()}") \ No newline at end of file + print(f"HTAN FHIR Patient: {patient.json()}") + print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") From af3d61d0ccb5e738d5a5e2f655033cbf8d8607d9 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 13:33:10 -0700 Subject: [PATCH 05/24] researchstudy and researchsubject identifiers --- fhirizer/entity2fhir.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fhirizer/entity2fhir.py b/fhirizer/entity2fhir.py index 3ce22c1..a97465e 100644 --- a/fhirizer/entity2fhir.py +++ b/fhirizer/entity2fhir.py @@ -84,7 +84,7 @@ def assign_fhir_for_project(project, disease_types=disease_types): pr_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "program_id"]), "value": project['ResearchStudy']['ResearchStudy.id']}) pl.append(pr_ident) - + rs.identifier = [pr_ident] rs.id = utils.mint_id( identifier=pr_ident, resource_type="ResearchStudy", @@ -94,6 +94,7 @@ def assign_fhir_for_project(project, disease_types=disease_types): else: p_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "project_id"]), "value": project['ResearchStudy.id']}) + rs.identifier = [p_ident] rs.id = utils.mint_id( identifier=p_ident, resource_type="ResearchStudy", @@ -157,6 +158,7 @@ def assign_fhir_for_project(project, disease_types=disease_types): ref = Reference(**{"reference": "/".join(["ResearchStudy", rs_parent.id])}) rs.partOf = [ref] + # condition -- subject --> patient <--subject-- researchsubject -- study --> researchstudy -- partOf 
--> researchstudy return {'ResearchStudy': rs.json(), "ResearchStudy.partOf": rs_parent.json(), 'ResearchStudy_obj': rs, @@ -394,6 +396,7 @@ def assign_fhir_for_case(case, disease_types=disease_types, primary_sites=primar research_subject.status = "active" research_subject.study = study_ref research_subject.subject = subject_ref + research_subject.identifier = [patient_id_identifier] research_subject.id = utils.mint_id( identifier=patient_id_identifier, resource_type="ResearchSubject", @@ -1890,6 +1893,7 @@ def assign_fhir_for_file(file): for case in file['cases']: patient_id_identifier = Identifier.construct() patient_id_identifier.value = case['Patient.id'] + patient_id_identifier.use = "official" patient_id_identifier.system = "".join(["https://gdc.cancer.gov/", "case_id"]) patient_id = utils.mint_id(identifier=patient_id_identifier, resource_type="Patient", project_id=project_id, From 4daa0693f9c458c064b6900a9a86b2c254fa6f08 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 2 Oct 2024 14:01:53 -0700 Subject: [PATCH 06/24] researchstudy - researchsubject - patient for list a of htan atlas names --- README.md | 13 ++--- fhirizer/htan2fhir.py | 122 +++++++++++++++++++++++++++++++++--------- 2 files changed, 104 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index b6882a1..3968812 100644 --- a/README.md +++ b/README.md @@ -155,12 +155,13 @@ fhirizer/ | | └── META/ | └── HTAN/ | └── OHSU/ -| └── Breast_NOS/ -| |-- raw/ -| | |-- files/ -| | |-- biospecimens/ -| | └── cases/ -| └── META/ +| |-- raw/ +| | |-- files/ +| | | |-- table_data.tsv +| | | └── cds_manifest.csv +| | |-- biospecimens/table_data.tsv +| | └── cases/table_data.tsv +| └── META/ | | |--README.md diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index d209512..2d68c9c 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -34,22 +34,24 @@ class HTANTransformer: - def __init__(self, subprogram_name: str, project_id: str, verbose: bool): + def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.mint_id = utils.mint_id self._mint_id = utils._mint_id self.get_data_type = utils.get_data_types self.get_component = utils.get_component self.fhir_ndjson = utils.fhir_ndjson self.subprogram_name = subprogram_name - self.project_id = project_id + self.project_id = subprogram_name # incase there will be more granular project/program relations + assert Path(out_dir).is_dir(), f"Path to out_dir {out_dir} is not a directory." + self.out_dir = out_dir self.verbose = verbose self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) - self.project_id = project_id self.read_json = utils._read_json + self.fhir_ndjson = utils.fhir_ndjson self.project_path = str( - Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name / project_id)) + Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name)) assert Path(self.project_path).is_dir(), f"Path {self.project_path} is not a valid directory path." 
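# A minimal sketch of the deterministic id minting used throughout this patch:
# utils.mint_id (defined in fhirizer/utils.py, not shown here) derives a stable
# UUID from an Identifier, the resource type, and the project id inside the
# HTAN namespace, so re-running a transform reproduces the same FHIR ids.
# The helper below only illustrates the idea and is an assumption, not the
# actual implementation.
from uuid import NAMESPACE_DNS, uuid3, uuid5

SKETCH_SYSTEM_HTAN = 'https://data.humantumoratlas.org'
SKETCH_NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SKETCH_SYSTEM_HTAN)

def mint_id_sketch(identifier_value: str, resource_type: str, project_id: str) -> str:
    """Same inputs always yield the same resource id."""
    return str(uuid5(SKETCH_NAMESPACE_HTAN, f"{project_id}/{resource_type}/{identifier_value}"))

# e.g. mint_id_sketch("HTA1_982", "Patient", "OHSU") returns the same UUID on every run.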
self.cases_path = str( @@ -71,7 +73,7 @@ def __init__(self, subprogram_name: str, project_id: str, verbose: bool): # cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter # 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP # 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient - self.cases_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/cases/table_data.tsv") + self.cases_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/cases/table_data.tsv") assert self.cases_table_data_path.is_file(), f"Path {self.cases_table_data_path} is not a valid file path." self.cases = self.get_dataframe(self.cases_table_data_path, sep="\t") self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df @@ -82,7 +84,7 @@ def __init__(self, subprogram_name: str, project_id: str, verbose: bool): # biospecimens to Specimen / Observation -> Specimen # 'HTAN Parent ID': #NOTE: Parent could be another biospecimen or a research participant. # check for participant id for type of reference # 'Biospecimen Type': #NOTE: Doesn't seem informative - self.biospecimens_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath( + self.biospecimens_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( "./raw/biospecimens/table_data.tsv") assert self.biospecimens_table_data_path.is_file(), f"Path {self.biospecimens_table_data_path} is not a valid file path." self.biospecimens = self.get_dataframe(self.biospecimens_table_data_path, sep="\t") @@ -93,8 +95,8 @@ def __init__(self, subprogram_name: str, project_id: str, verbose: bool): # files_mapping # files to DocumentReference / Attachment / Observation -> DocumentReference - self.files_table_data_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/table_data.tsv") - self.files_drs_uri_path = Path(Path(self.project_path).parent / self.project_id).joinpath("./raw/files/cds_manifest.csv") + self.files_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/table_data.tsv") + self.files_drs_uri_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/cds_manifest.csv") assert self.files_table_data_path.is_file(), f"Path {self.files_table_data_path} is not a valid file path." assert self.files_drs_uri_path.is_file(), f"Path {self.files_drs_uri_path} is not a valid file path." @@ -183,6 +185,29 @@ def get_fhir_maps_by_field(mapping_data, field_name=None): _focus = mapping.get("focus", None) _use = mapping.get("use", None) yield _field, _fhir_map, _use, _focus + @staticmethod + def decipher_htan_id(_id) -> dict: + """ + ::= _integer + ::= _integer + wild-card string ex. 
'0000' is used for the same file derived from multiple participants + substring 'EXT' is used for external participants + """ + deciphered_id = {} + _id_substrings = _id.split("_") + participant_id = "_".join([_id_substrings[0],_id_substrings[1]]) + if 'EXT' not in _id_substrings[1] or '0000' not in _id_substrings[1]: + deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} + else: + participant_id = "_".join([_id_substrings[0], _id_substrings[1], _id_substrings[2]]) + deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} + return deciphered_id + + def write_ndjson(self, entities): + resource_type = entities[0].resource_type + entities = [orjson.loads(entity.json()) for entity in entities] + entities = list({v['id']: v for v in entities}.values()) + utils.fhir_ndjson(entities, "".join([self.out_dir, "/", resource_type, ".ndjson"])) class PatientTransformer(HTANTransformer): @@ -219,7 +244,7 @@ def create_patient(self, _row: pd.Series) -> Patient: race = _row.get("Race") address_country = _row.get("Country of Residence") - address = Address(**{"country": address_country}) + address = [Address(**{"country": address_country})] if not pd.isna(address_country) else [] return Patient(**{"id": patient_id, "identifier": [patient_identifier], @@ -229,7 +254,7 @@ def create_patient(self, _row: pd.Series) -> Patient: {"url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "valueString": race} ], - "address": [address]}) + "address": address}) def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: patient_observation_fields = [] @@ -281,18 +306,65 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "component": components}) - -transformer = HTANTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) -patient_transformer = PatientTransformer(subprogram_name="OHSU", project_id="Breast_NOS", verbose=False) -patient_demographics_df = transformer.patient_demographics -cases = transformer.cases - -patients = [] -for index, row in cases.iterrows(): - patient_row = cases.iloc[index][patient_demographics_df.columns] - patient = patient_transformer.create_patient(_row=patient_row) - patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) - if patient: - patients.append(orjson.loads(patient.json())) - print(f"HTAN FHIR Patient: {patient.json()}") - print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: + study_field = None + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), "ResearchStudy.name"): + study_field = field + study_name = _row.get(study_field) + researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": study_name}) + researchstudy_id = self.mint_id(identifier=researchstudy_identifier, resource_type="ResearchStudy", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + return ResearchStudy(**{"id": researchstudy_id, + "identifier": [researchstudy_identifier], + "name": study_name, + "status": "open"}) # TODO: add "condition" snomed id + + def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: + researchsubject_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.identifier[0].value}) + researchsubject_id = 
self.mint_id(identifier=researchsubject_identifier, resource_type="ResearchSubject", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + return ResearchSubject(**{"id": researchsubject_id, + "identifier": [researchsubject_identifier], + "status": "active", + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "study": Reference(**{"reference": f"ResearchStudy/{study.id}"})}) + + +atlas_name = ["OHSU", "DFCI", "WUSTL"] +for name in atlas_name: + + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + + patient_demographics_df = transformer.patient_demographics + cases = transformer.cases + + patients = [] + research_studies = [] + research_subjects = [] + for index, row in cases.iterrows(): + + research_study = patient_transformer.create_researchstudy(_row=row) + + if research_study: + research_studies.append(research_study) + + patient_row = cases.iloc[index][patient_demographics_df.columns] + patient = patient_transformer.create_patient(_row=patient_row) + patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) + if patient: + patients.append(patient) + print(f"HTAN FHIR Patient: {patient.json()}") + print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + + research_subject = patient_transformer.create_researchsubject(patient, research_study) + if research_subject: + research_subjects.append(research_subject) + + transformer.write_ndjson(research_subjects) + transformer.write_ndjson(research_studies) + transformer.write_ndjson(patients) + + # participant ids from specimen identifiers + # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) + # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) \ No newline at end of file From 8b276b650b4730b0fd31345573fadd5752f45495 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 3 Oct 2024 06:32:56 -0700 Subject: [PATCH 07/24] transform all available HTAN atlases --- fhirizer/htan2fhir.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 2d68c9c..8b05501 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -329,8 +329,9 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "study": Reference(**{"reference": f"ResearchStudy/{study.id}"})}) - -atlas_name = ["OHSU", "DFCI", "WUSTL"] +# 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) +# 12/14 total Atlas +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) From 122a220d81c715f9795066b6d2de201cef3d03a8 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 3 Oct 2024 09:52:17 -0700 Subject: [PATCH 08/24] initial encounter and condition --- fhirizer/htan2fhir.py | 201 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 178 insertions(+), 23 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 8b05501..71c8ead 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -1,5 +1,7 @@ import uuid import json + +import numpy as np import orjson import copy import glob @@ -11,24 +13,30 
@@ from pathlib import Path import importlib.resources from uuid import uuid3, NAMESPACE_DNS -from typing import Any +from typing import Any, List, Optional +from datetime import datetime from fhir.resources.reference import Reference from fhir.resources.identifier import Identifier -from fhir.resources.codeableconcept import CodeableConcept from fhir.resources.patient import Patient from fhir.resources.address import Address from fhir.resources.researchstudy import ResearchStudy from fhir.resources.researchsubject import ResearchSubject +from fhir.resources.observation import Observation +from fhir.resources.encounter import Encounter +from fhir.resources.codeableconcept import CodeableConcept +from fhir.resources.age import Age +from fhir.resources.procedure import Procedure +from fhir.resources.bodystructure import BodyStructure, BodyStructureIncludedStructure from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection from fhir.resources.condition import Condition, ConditionStage from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ DocumentReferenceContentProfile from fhir.resources.attachment import Attachment -from fhir.resources.observation import Observation from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication + # File data on synapse after authentication # https://github.com/Sage-Bionetworks/synapsePythonClient?tab=readme-ov-file#store-a-file-to-synapse @@ -41,11 +49,13 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.get_component = utils.get_component self.fhir_ndjson = utils.fhir_ndjson self.subprogram_name = subprogram_name - self.project_id = subprogram_name # incase there will be more granular project/program relations + self.project_id = subprogram_name # incase there will be more granular project/program relations assert Path(out_dir).is_dir(), f"Path to out_dir {out_dir} is not a directory." self.out_dir = out_dir self.verbose = verbose self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' + self.SYSTEM_SNOME = 'http://snomed.info/sct' + self.SYSTEM_LOINC = 'http://loinc.org' self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson @@ -73,10 +83,11 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): # cases to Patient / ResearchSubject / ResearchStudy / Observation -> Condition / Medication / MedicationAdministration / Procedure / Encounter # 'HTAN Participant ID': #NOTE: HTAN ID associated with a patient based on HTAN ID SOP # 'Therapeutic Agents': #NOTE: Some have multiple comma-separated Medication.ingredient - self.cases_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/cases/table_data.tsv") + self.cases_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( + "./raw/cases/table_data.tsv") assert self.cases_table_data_path.is_file(), f"Path {self.cases_table_data_path} is not a valid file path." 
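# A rough, self-contained look at the table this path points to, assuming the
# directory layout shown in the README above; the OHSU path is an example only
# and get_dataframe may do more than a plain read:
import pandas as pd

example_cases = pd.read_csv("./projects/HTAN/OHSU/raw/cases/table_data.tsv", sep="\t")
assert "HTAN Participant ID" in example_cases.columns  # the patient identifier field used below
print(example_cases["HTAN Participant ID"].nunique(), "participants in the cases table")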
self.cases = self.get_dataframe(self.cases_table_data_path, sep="\t") - self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df + self.patient_identifier_field = "HTAN Participant ID" # identifiers of the cases matrix/df self.biospecimen_mappings = self.get_biospecimen_mappings @@ -95,8 +106,10 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): # files_mapping # files to DocumentReference / Attachment / Observation -> DocumentReference - self.files_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/table_data.tsv") - self.files_drs_uri_path = Path(Path(self.project_path).parent / subprogram_name).joinpath("./raw/files/cds_manifest.csv") + self.files_table_data_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( + "./raw/files/table_data.tsv") + self.files_drs_uri_path = Path(Path(self.project_path).parent / subprogram_name).joinpath( + "./raw/files/cds_manifest.csv") assert self.files_table_data_path.is_file(), f"Path {self.files_table_data_path} is not a valid file path." assert self.files_drs_uri_path.is_file(), f"Path {self.files_drs_uri_path} is not a valid file path." @@ -127,7 +140,8 @@ def get_dataframe(_path, sep) -> pd.DataFrame: def get_patient_demographics(self) -> pd.DataFrame: """HTAN cases table_data.tsv data with Patient FHIR demographics mappings column/field match""" field_list = [] - for field in self.get_htan_mapping(match='Patient', field_maps=self.cases_mappings(), map_info='fhir_map', fetch='field'): + for field in self.get_htan_mapping(match='Patient', field_maps=self.cases_mappings(), map_info='fhir_map', + fetch='field'): field_list.append(field) if self.verbose: print(f"field name': {field}") @@ -185,6 +199,24 @@ def get_fhir_maps_by_field(mapping_data, field_name=None): _focus = mapping.get("focus", None) _use = mapping.get("use", None) yield _field, _fhir_map, _use, _focus + + def get_field_value(self, _row: pd.Series, mapping_type: str, fhir_field: str) -> dict: + mapping_data = None + if mapping_type == "case": + mapping_data = self.cases_mappings() + elif mapping_data == "biospecimen": + mapping_data = self.biospecimen_mappings() + elif mapping_type == "file": + mapping_data = self.files_mappings() + + _this_htan_field = None + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(mapping_data=mapping_data, + fhir_mapping=fhir_field): + _this_htan_field = field + _filed_value = _row.get(_this_htan_field) + + return {"htan_field": _this_htan_field, "htan_field_value": _filed_value} + @staticmethod def decipher_htan_id(_id) -> dict: """ @@ -195,7 +227,7 @@ def decipher_htan_id(_id) -> dict: """ deciphered_id = {} _id_substrings = _id.split("_") - participant_id = "_".join([_id_substrings[0],_id_substrings[1]]) + participant_id = "_".join([_id_substrings[0], _id_substrings[1]]) if 'EXT' not in _id_substrings[1] or '0000' not in _id_substrings[1]: deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} else: @@ -226,16 +258,17 @@ def create_patient(self, _row: pd.Series) -> Patient: use = _use assert use, f"Patient.identifier use is not defined in ./resources/HTAN/cases.json mappings." 
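# A minimal, standalone example of the Patient shape produced by create_patient,
# assuming only what the cases.json mappings define (an official HTAN identifier
# plus an optional deceasedBoolean); the id and identifier values here are
# placeholders, not minted ones:
from fhir.resources.identifier import Identifier
from fhir.resources.patient import Patient

example_identifier = Identifier(**{"system": "https://data.humantumoratlas.org",
                                   "value": "HTA1_982", "use": "official"})
example_patient = Patient(**{"id": "00000000-0000-0000-0000-000000000000",
                             "identifier": [example_identifier],
                             "deceasedBoolean": False})
print(example_patient.json())  # serialized the same way the transform writes patients to META/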
- patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": _row['HTAN Participant ID'], "use": use}) + patient_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Participant ID'], "use": use}) patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + namespace=self.NAMESPACE_HTAN) deceasedBoolean_fields = [] - for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), "Patient.deceasedBoolean"): + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), + "Patient.deceasedBoolean"): deceasedBoolean_fields.append(_field) assert deceasedBoolean_fields, f"Patient.deceasedBoolean has no fields defined in ./resources/HTAN/cases.json mappings." - vital_status = _row[deceasedBoolean_fields].dropna().unique().any() deceasedBoolean = {"Dead": True}.get(vital_status, False if vital_status else None) @@ -258,7 +291,8 @@ def create_patient(self, _row: pd.Series) -> Patient: def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: patient_observation_fields = [] - for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), "Observation.component"): + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), + "Observation.component"): if focus == "Patient": patient_observation_fields.append(field) @@ -268,9 +302,12 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: components = [] for key, value in _obervation_row.to_dict().items(): if key != 'HTAN Participant ID': - if isinstance(value, float) and not pd.isna(value) and ("Year" in key or "Day" in key or "year" in key or "day" in key): + if isinstance(value, float) and not pd.isna(value) and ( + "Year" in key or "Day" in key or "year" in key or "day" in key): value = int(value) - _component = self.get_component(key=key, value=value, component_type=self.get_data_types(type(value).__name__), system=self.SYSTEM_HTAN) + _component = self.get_component(key=key, value=value, + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) components.append(_component) observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.id}) @@ -295,8 +332,8 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: "code": { "coding": [ { - "system": "http://loinc.org", - "code": "52460-3", # TODO: may need to change to be more specific + "system": self.SYSTEM_LOINC, + "code": "52460-3", # TODO: may need to change to be more specific "display": "patient information" } ], @@ -317,10 +354,11 @@ def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: return ResearchStudy(**{"id": researchstudy_id, "identifier": [researchstudy_identifier], "name": study_name, - "status": "open"}) # TODO: add "condition" snomed id + "status": "open"}) # TODO: add "condition" snomed id def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: - researchsubject_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.identifier[0].value}) + researchsubject_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.identifier[0].value}) researchsubject_id = self.mint_id(identifier=researchsubject_identifier, resource_type="ResearchSubject", project_id=self.project_id, 
namespace=self.NAMESPACE_HTAN) return ResearchSubject(**{"id": researchsubject_id, @@ -329,13 +367,115 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "study": Reference(**{"reference": f"ResearchStudy/{study.id}"})}) + def create_encounter(self, _row: pd.Series, patient: Patient, condition: Optional[Condition], + procedure: Optional[Procedure]) -> Encounter: + # identifier string = project / patient / [condition/procedure] - assume parent encounter atm + condition_procedure = "" + if condition: + condition_procedure = condition.id + elif procedure: + condition_procedure = procedure.id + + encounter_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", + "value": "/".join([self.subprogram_name, patient.identifier[0].value])}) + encounter_id = self.mint_id(identifier=encounter_identifier, resource_type="Encounter", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + return Encounter(**{"id": encounter_id, + "identifier": [encounter_identifier], + "status": "completed", + "subject": Reference(**{"reference": f"Patient/{patient.id}"}) + }) + + def create_body_structure(self, _row, patient: Patient) -> BodyStructure: + body_structure_value = _row.get("Tissue or Organ of Origin") + included_structure = [] + if body_structure_value: + included_structure = [BodyStructureIncludedStructure(**{"structure": CodeableConcept(**{"coding": [ + {"code": body_structure_value, "system": self.SYSTEM_HTAN, "display": body_structure_value}]})})] + body_struct_ident = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", "value": body_structure_value}) + return BodyStructure( + **{"id": utils.mint_id(identifier=[patient.identifier[0].value, body_struct_ident], + resource_type="BodyStructure", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN), + "identifier": [body_struct_ident], + "includedStructure": included_structure, + "patient": Reference(**{"reference": f"Patient/{patient.id}"}) + }) + + def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encounter, + body_structure: Optional[BodyStructure]) -> Optional[Condition]: + primary_diagnosis = _row.get("Primary Diagnosis") + if pd.isnull(primary_diagnosis): + return None + + # identifier string = project / patient / primary diagnosis + condition_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": "/".join([self.subprogram_name, patient.id, + primary_diagnosis])}) + condition_id = self.mint_id(identifier=condition_identifier, resource_type="ResearchSubject", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + onset_age = None + primary_diagnosis_age = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.onsetAge") + + primary_diagnosis_age_value = None + if not np.isnan(primary_diagnosis_age["htan_field_value"]): + primary_diagnosis_age_value = int(primary_diagnosis_age["htan_field_value"]) + + if primary_diagnosis_age_value: + onset_age = Age(**{"value": primary_diagnosis_age_value, + "unit": "years", + "system": "http://unitsofmeasure.org", + "code": "a" + }) + + recorded_date_field_value = self.get_field_value(_row=_row, mapping_type="case", + fhir_field="Condition.recordedDate") + recorded_date = None + if not np.isnan(recorded_date_field_value["htan_field_value"]): + recorded_date = datetime(int(recorded_date_field_value["htan_field_value"]), 1, 1) + + body_structure = self.create_body_structure(_row, patient) + 
patient_body_structure_ref = Reference(**{"reference": f"BodyStructure/{body_structure.id}"}) if body_structure.includedStructure else None + + patient_body_site_cc = [] + patient_body_site = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.bodySite")["htan_field_value"] + + if patient_body_site: + patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, + "system": self.SYSTEM_HTAN, + "display": patient_body_site}]})] + + return Condition(**{"id": condition_id, + "identifier": [condition_identifier], + "code": CodeableConcept(**{"coding": [{"code": primary_diagnosis, + "system": self.SYSTEM_HTAN, + "display": primary_diagnosis}]}), + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "clinicalStatus": CodeableConcept(**{"coding": [{"code": "active", + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical" , + "display": "Active"}]}), + "onsetAge": onset_age, + "recordedDate": recorded_date, + "bodySite": patient_body_site_cc, + # "bodyStructure": patient_body_structure_ref, + "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}), + "stage": [], + }) + + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) - patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases @@ -343,6 +483,8 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese patients = [] research_studies = [] research_subjects = [] + conditions = [] + encounters = [] for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) @@ -362,10 +504,23 @@ def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> Rese if research_subject: research_subjects.append(research_subject) + encounter = patient_transformer.create_encounter(_row=row, patient=patient, condition=None, + procedure=None) + if encounter: + encounters.append(encounter) + condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, + body_structure=None) + + if condition: + conditions.append(condition) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) + transformer.write_ndjson(encounters) + transformer.write_ndjson(conditions) # participant ids from specimen identifiers # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) - # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) \ No newline at end of file + # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) + From c58f467798a20a246e3f2a0cf3c7bd97e7656bbc Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 07:06:50 -0700 Subject: [PATCH 09/24] condition observations + research study partOf --- fhirizer/htan2fhir.py | 108 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 
71c8ead..c6f923e 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -60,6 +60,14 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson + parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) + parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, resource_type="ResearchStudy", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + self.program_research_study = ResearchStudy(**{"id": parent_researchstudy_id, + "identifier": [parent_researchstudy_identifier], + "name": "HTAN", + "status": "open"}) + self.project_path = str( Path(importlib.resources.files('fhirizer').parent / 'projects' / 'HTAN' / subprogram_name)) assert Path(self.project_path).is_dir(), f"Path {self.project_path} is not a valid directory path." @@ -118,6 +126,7 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.patient_demographics = self.get_patient_demographics() + def get_cases_mappings(self) -> dict: """HTAN cases FHIR mapping""" return self.read_json(self.cases_path) @@ -308,6 +317,7 @@ def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: _component = self.get_component(key=key, value=value, component_type=self.get_data_types(type(value).__name__), system=self.SYSTEM_HTAN) + components.append(_component) observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": patient.id}) @@ -351,10 +361,13 @@ def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": study_name}) researchstudy_id = self.mint_id(identifier=researchstudy_identifier, resource_type="ResearchStudy", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + # TODO: add "condition" snomed id return ResearchStudy(**{"id": researchstudy_id, "identifier": [researchstudy_identifier], "name": study_name, - "status": "open"}) # TODO: add "condition" snomed id + "status": "open", + "partOf": [Reference(**{"reference": f"ResearchStudy/{self.program_research_study.id}"})]}) def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: researchsubject_identifier = Identifier( @@ -377,7 +390,7 @@ def create_encounter(self, _row: pd.Series, patient: Patient, condition: Optiona condition_procedure = procedure.id encounter_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", - "value": "/".join([self.subprogram_name, patient.identifier[0].value])}) + "value": "-".join([self.subprogram_name, patient.identifier[0].value])}) encounter_id = self.mint_id(identifier=encounter_identifier, resource_type="Encounter", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) @@ -414,7 +427,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount # identifier string = project / patient / primary diagnosis condition_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", - "value": "/".join([self.subprogram_name, patient.id, + "value": "-".join([self.subprogram_name, patient.id, primary_diagnosis])}) condition_id = self.mint_id(identifier=condition_identifier, resource_type="ResearchSubject", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) @@ -447,8 +460,8 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount if patient_body_site: 
patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, - "system": self.SYSTEM_HTAN, - "display": patient_body_site}]})] + "system": self.SYSTEM_HTAN, + "display": patient_body_site}]})] return Condition(**{"id": condition_id, "identifier": [condition_identifier], @@ -467,6 +480,78 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) + def create_observation(self, _row: pd.Series, patient: Patient, + specimen: Optional[Specimen], official_focus: str, + focus: List[Reference], components: Optional[List], category: Optional[dict]) -> Observation: + assert focus, f"Observation for patient {patient.id} is missing focus." + + if not category: + category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "exam", + "display": "exam" + } + ], + "text": "Exam" + } + ] + + observation_fields = [] + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), + "Observation.component"): + if _focus == official_focus: + observation_fields.append(_field) + + _obervation_row = _row[observation_fields] if observation_fields else None + + if _obervation_row is not None: + components = [] + for key, value in _obervation_row.to_dict().items(): + if key != 'HTAN Participant ID': + try: + if not pd.isnull(value): + if not isinstance(value, str) and value.is_integer(): + value = int(value) + _component = self.get_component(key=key, value=value, + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) + components.append(_component) + except (ValueError, TypeError): + if self.verbose: + print(f"Components {key}: {value} can't be added to list - value/type error.") + + focus_ids = [r.reference.split("/")[1] for r in focus] + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": "-".join([patient.identifier[0].value] + focus_ids)}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + specimen_ref = None + if specimen: + specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) + # add valueCodeableConcept as needed after creation + return Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "category": category, + "code": { + "coding": [ + { + "system": self.SYSTEM_LOINC, + "code": "75323-6", # TODO: place-holder + "display": "Condition" + } + ], + "text": "Condition" + }, + "focus": focus, + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "component": components, + "specimen": specimen_ref}) + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas @@ -485,20 +570,24 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount research_subjects = [] conditions = [] encounters = [] + observations = [] for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) if research_study: + research_studies.append(transformer.program_research_study) research_studies.append(research_study) patient_row = cases.iloc[index][patient_demographics_df.columns] patient = patient_transformer.create_patient(_row=patient_row) patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) + if patient_obs: + observations.append(patient_obs) if patient: patients.append(patient) - 
print(f"HTAN FHIR Patient: {patient.json()}") - print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + # print(f"HTAN FHIR Patient: {patient.json()}") + # print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") research_subject = patient_transformer.create_researchsubject(patient, research_study) if research_subject: @@ -514,11 +603,16 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount if condition: conditions.append(condition) + condition_observation = patient_transformer.create_observation(_row=row, patient=patient, official_focus="Condition", focus=[Reference(**{"reference": f"Condition/{condition.id}"})], specimen=None, components=None, category=None) + if condition_observation: + observations.append(condition_observation) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) transformer.write_ndjson(encounters) transformer.write_ndjson(conditions) + transformer.write_ndjson(observations) # participant ids from specimen identifiers # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) From 2e5c4f50f78a45fed83ac4ec1d0fc65b2f666075 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 07:52:48 -0700 Subject: [PATCH 10/24] initial specimen --- fhirizer/htan2fhir.py | 107 +++++++++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 17 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index c6f923e..695bde3 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -61,8 +61,9 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.fhir_ndjson = utils.fhir_ndjson parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) - parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, resource_type="ResearchStudy", - project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, + resource_type="ResearchStudy", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) self.program_research_study = ResearchStudy(**{"id": parent_researchstudy_id, "identifier": [parent_researchstudy_identifier], "name": "HTAN", @@ -126,7 +127,6 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.patient_demographics = self.get_patient_demographics() - def get_cases_mappings(self) -> dict: """HTAN cases FHIR mapping""" return self.read_json(self.cases_path) @@ -367,7 +367,8 @@ def create_researchstudy(self, _row: pd.Series) -> ResearchStudy: "identifier": [researchstudy_identifier], "name": study_name, "status": "open", - "partOf": [Reference(**{"reference": f"ResearchStudy/{self.program_research_study.id}"})]}) + "partOf": [ + Reference(**{"reference": f"ResearchStudy/{self.program_research_study.id}"})]}) def create_researchsubject(self, patient: Patient, study: ResearchStudy) -> ResearchSubject: researchsubject_identifier = Identifier( @@ -453,10 +454,12 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount recorded_date = datetime(int(recorded_date_field_value["htan_field_value"]), 1, 1) body_structure = self.create_body_structure(_row, patient) - patient_body_structure_ref = Reference(**{"reference": f"BodyStructure/{body_structure.id}"}) if body_structure.includedStructure else None + patient_body_structure_ref = Reference( + **{"reference": 
f"BodyStructure/{body_structure.id}"}) if body_structure.includedStructure else None patient_body_site_cc = [] - patient_body_site = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.bodySite")["htan_field_value"] + patient_body_site = self.get_field_value(_row=_row, mapping_type="case", fhir_field="Condition.bodySite")[ + "htan_field_value"] if patient_body_site: patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, @@ -470,7 +473,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "display": primary_diagnosis}]}), "subject": Reference(**{"reference": f"Patient/{patient.id}"}), "clinicalStatus": CodeableConcept(**{"coding": [{"code": "active", - "system": "http://terminology.hl7.org/CodeSystem/condition-clinical" , + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "display": "Active"}]}), "onsetAge": onset_age, "recordedDate": recorded_date, @@ -497,11 +500,11 @@ def create_observation(self, _row: pd.Series, patient: Patient, ], "text": "Exam" } - ] + ] observation_fields = [] for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), - "Observation.component"): + "Observation.component"): if _focus == official_focus: observation_fields.append(_field) @@ -516,8 +519,8 @@ def create_observation(self, _row: pd.Series, patient: Patient, if not isinstance(value, str) and value.is_integer(): value = int(value) _component = self.get_component(key=key, value=value, - component_type=self.get_data_types(type(value).__name__), - system=self.SYSTEM_HTAN) + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) components.append(_component) except (ValueError, TypeError): if self.verbose: @@ -541,7 +544,7 @@ def create_observation(self, _row: pd.Series, patient: Patient, "coding": [ { "system": self.SYSTEM_LOINC, - "code": "75323-6", # TODO: place-holder + "code": "75323-6", # TODO: place-holder "display": "Condition" } ], @@ -552,18 +555,76 @@ def create_observation(self, _row: pd.Series, patient: Patient, "component": components, "specimen": specimen_ref}) + def create_medication_administration(self) -> MedicationAdministration: + return MedicationAdministration(**{}) + + +class SpecimenTransformer(HTANTransformer): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(**kwargs) + self.cases_mapping = self.cases_mappings + self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + self.get_data_types = utils.get_data_types + self.get_component = self.get_component + self.get_fields_by_fhir_map = self.get_fields_by_fhir_map + + def create_specimen(self, _row: pd.Series) -> Specimen: + """Transform HTAN biospecimen to FHIR Specimen""" + + specimen_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Biospecimen ID'], "use": "official"}) + specimen_id = self.mint_id(identifier=specimen_identifier, resource_type="Specimen", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + # participant id from specimen identifier + participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] + assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
+ + patient_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists + + parent_specimen_reference = [] + if not pd.isnull(_row["HTAN Parent ID"]): + parent_specimen_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Biospecimen ID'], "use": "official"}) + parent_specimen_id = self.mint_id(identifier=parent_specimen_identifier, resource_type="Specimen", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + parent_specimen_reference.append(Reference(**{"reference": f"Specimen/{parent_specimen_id}"})) + + specimen_fields = [] + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(self.cases_mapping(), + "Specimen"): + specimen_fields.append(_field) + + return Specimen(**{"id": specimen_id, + "identifier": [specimen_identifier], + "type": CodeableConcept(**{"coding": [ + {"code": _row["Biospecimen Type"], "system": self.SYSTEM_HTAN, + "display": _row["Biospecimen Type"]}]}), + "processing": [SpecimenProcessing(**{"method": CodeableConcept(**{"coding": [ + {"code": _row["Preservation Method"], "system": self.SYSTEM_HTAN, + "display": _row["Preservation Method"]}]})})], + "parent": parent_specimen_reference, + "subject": subject}) + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +# atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +atlas_name = ["OHSU"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases + htan_biospecimens = transformer.biospecimens patients = [] research_studies = [] @@ -603,18 +664,30 @@ def create_observation(self, _row: pd.Series, patient: Patient, if condition: conditions.append(condition) - condition_observation = patient_transformer.create_observation(_row=row, patient=patient, official_focus="Condition", focus=[Reference(**{"reference": f"Condition/{condition.id}"})], specimen=None, components=None, category=None) + condition_observation = patient_transformer.create_observation(_row=row, patient=patient, + official_focus="Condition", + focus=[Reference(**{ + "reference": f"Condition/{condition.id}"})], + specimen=None, components=None, + category=None) if condition_observation: observations.append(condition_observation) + specimens = [] + for index, row in htan_biospecimens.iterrows(): + specimen_row = htan_biospecimens.iloc[index] + specimen = specimen_transformer.create_specimen(_row=specimen_row) + if specimen: + specimens.append(specimen) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) transformer.write_ndjson(encounters) transformer.write_ndjson(conditions) transformer.write_ndjson(observations) + 
transformer.write_ndjson(specimens) # participant ids from specimen identifiers - # transformer.decipher_htan_id(transformer.biospecimens["HTAN Biospecimen ID"][0]) - # transformer.decipher_htan_id(transformer.cases["HTAN Participant ID"][0]) - + # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) + # print(transformer.decipher_htan_id(cases["HTAN Participant ID"][0])) From 4815b352683d64697eca34b84eed4d851e3208d9 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 08:56:43 -0700 Subject: [PATCH 11/24] specimen observations --- fhirizer/htan2fhir.py | 229 ++++++++++++++++++++++++++---------------- 1 file changed, 144 insertions(+), 85 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 695bde3..c21fb1f 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -244,6 +244,102 @@ def decipher_htan_id(_id) -> dict: deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} return deciphered_id + def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: str, + specimen: Optional[Specimen], official_focus: str, + focus: List[Reference], components: Optional[List], category: Optional[list]) -> Observation: + assert patient_id, f"Observation is missing patient id: {patient_id}." + assert focus, f"Observation for patient {patient_id} is missing focus." + + if not category: + category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "exam", + "display": "exam" + } + ], + "text": "Exam" + } + ] + + observation_fields = [] + + if official_focus not in ["Specimen"]: + mappings = transformer.cases_mappings() + code = { + "coding": [ + { + "system": self.SYSTEM_LOINC, + "code": "68992-7", + "display": "Specimen-related information panel" + } + ], + "text": "Specimen-related information panel" + } + else: + mappings = transformer.biospecimen_mappings() + code = { + "coding": [ + { + "system": "http://loinc.org", + "code": "75323-6", + "display": "Condition" + } + ], + "text": "Condition" + } + + for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(mappings, + "Observation.component"): + if _focus == official_focus: + observation_fields.append(_field) + + _obervation_row = _row[observation_fields] if observation_fields else None + + if _obervation_row is not None: + components = [] + for key, value in _obervation_row.to_dict().items(): + if key != 'HTAN Participant ID': + try: + if not pd.isnull(value): + if not isinstance(value, str) and value.is_integer(): + value = int(value) + _component = self.get_component(key=key, value=value, + component_type=self.get_data_types(type(value).__name__), + system=self.SYSTEM_HTAN) + components.append(_component) + except (ValueError, TypeError): + if self.verbose: + print(f"Components {key}: {value} can't be added to list - value/type error.") + + focus_ids = [r.reference.split("/")[1] for r in focus] + + if patient: + identifier_value = "-".join([patient.identifier[0].value] + focus_ids) + else: + identifier_value = "-".join(focus_ids) + + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": identifier_value}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + specimen_ref = None + if specimen: + specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) + # add valueCodeableConcept as needed after creation + 
return Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "category": category, + "code": code, + "focus": focus, + "subject": Reference(**{"reference": f"Patient/{patient_id}"}), + "component": components, + "specimen": specimen_ref}) + def write_ndjson(self, entities): resource_type = entities[0].resource_type entities = [orjson.loads(entity.json()) for entity in entities] @@ -259,6 +355,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.get_data_types = utils.get_data_types self.get_component = self.get_component self.get_fields_by_fhir_map = self.get_fields_by_fhir_map + self.create_observation = self.create_observation def create_patient(self, _row: pd.Series) -> Patient: """Transform HTAN case demographics to FHIR Patient""" @@ -483,78 +580,6 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) - def create_observation(self, _row: pd.Series, patient: Patient, - specimen: Optional[Specimen], official_focus: str, - focus: List[Reference], components: Optional[List], category: Optional[dict]) -> Observation: - assert focus, f"Observation for patient {patient.id} is missing focus." - - if not category: - category = [ - { - "coding": [ - { - "system": "http://terminology.hl7.org/CodeSystem/observation-category", - "code": "exam", - "display": "exam" - } - ], - "text": "Exam" - } - ] - - observation_fields = [] - for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), - "Observation.component"): - if _focus == official_focus: - observation_fields.append(_field) - - _obervation_row = _row[observation_fields] if observation_fields else None - - if _obervation_row is not None: - components = [] - for key, value in _obervation_row.to_dict().items(): - if key != 'HTAN Participant ID': - try: - if not pd.isnull(value): - if not isinstance(value, str) and value.is_integer(): - value = int(value) - _component = self.get_component(key=key, value=value, - component_type=self.get_data_types(type(value).__name__), - system=self.SYSTEM_HTAN) - components.append(_component) - except (ValueError, TypeError): - if self.verbose: - print(f"Components {key}: {value} can't be added to list - value/type error.") - - focus_ids = [r.reference.split("/")[1] for r in focus] - observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, - "use": "official", - "value": "-".join([patient.identifier[0].value] + focus_ids)}) - observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", - project_id=self.project_id, namespace=self.NAMESPACE_HTAN) - specimen_ref = None - if specimen: - specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) - # add valueCodeableConcept as needed after creation - return Observation(**{"id": observation_id, - "identifier": [observation_identifier], - "status": "final", - "category": category, - "code": { - "coding": [ - { - "system": self.SYSTEM_LOINC, - "code": "75323-6", # TODO: place-holder - "display": "Condition" - } - ], - "text": "Condition" - }, - "focus": focus, - "subject": Reference(**{"reference": f"Patient/{patient.id}"}), - "component": components, - "specimen": specimen_ref}) - def create_medication_administration(self) -> MedicationAdministration: return MedicationAdministration(**{}) @@ -567,6 +592,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.get_data_types = utils.get_data_types self.get_component = self.get_component self.get_fields_by_fhir_map = 
self.get_fields_by_fhir_map + self.create_observation = self.create_observation def create_specimen(self, _row: pd.Series) -> Specimen: """Transform HTAN biospecimen to FHIR Specimen""" @@ -580,18 +606,16 @@ def create_specimen(self, _row: pd.Series) -> Specimen: participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." - patient_identifier = Identifier( - **{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) - patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) - subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists + patient_id = self.get_specimen_patient(_row=_row) + subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists parent_specimen_reference = [] if not pd.isnull(_row["HTAN Parent ID"]): parent_specimen_identifier = Identifier( **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Biospecimen ID'], "use": "official"}) - parent_specimen_id = self.mint_id(identifier=parent_specimen_identifier, resource_type="Specimen", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + parent_specimen_id = self.mint_id(identifier=parent_specimen_identifier, resource_type="Specimen", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) parent_specimen_reference.append(Reference(**{"reference": f"Specimen/{parent_specimen_id}"})) specimen_fields = [] @@ -610,17 +634,27 @@ def create_specimen(self, _row: pd.Series) -> Specimen: "parent": parent_specimen_reference, "subject": subject}) + def get_specimen_patient(self, _row) -> str: + participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] + assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
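# Every transform loop below finishes with write_ndjson, which de-duplicates
# resources by id and delegates to utils.fhir_ndjson. That helper is not shown
# in this patch; a stand-in with the same observable output (one JSON object
# per line under META/<ResourceType>.ndjson) could look like this sketch:
import orjson

def fhir_ndjson_sketch(entities: list, path: str) -> None:
    with open(path, "wb") as fp:
        for entity in entities:  # plain dicts, e.g. orjson.loads(resource.json())
            fp.write(orjson.dumps(entity) + b"\n")

# fhir_ndjson_sketch(specimen_dicts, "./projects/HTAN/OHSU/META/Specimen.ndjson")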
+ + patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + return patient_id + # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -# atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] -atlas_name = ["OHSU"] +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", + "Vanderbilt"] for name in atlas_name: transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) - specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases @@ -633,7 +667,6 @@ def create_specimen(self, _row: pd.Series) -> Specimen: encounters = [] observations = [] for index, row in cases.iterrows(): - research_study = patient_transformer.create_researchstudy(_row=row) if research_study: @@ -665,6 +698,7 @@ def create_specimen(self, _row: pd.Series) -> Specimen: conditions.append(condition) condition_observation = patient_transformer.create_observation(_row=row, patient=patient, + patient_id=patient.id, official_focus="Condition", focus=[Reference(**{ "reference": f"Condition/{condition.id}"})], @@ -674,12 +708,37 @@ def create_specimen(self, _row: pd.Series) -> Specimen: observations.append(condition_observation) specimens = [] - for index, row in htan_biospecimens.iterrows(): - specimen_row = htan_biospecimens.iloc[index] + for specimen_index, specimen_row in htan_biospecimens.iterrows(): + # specimen_row = htan_biospecimens.iloc[specimen_index] specimen = specimen_transformer.create_specimen(_row=specimen_row) if specimen: specimens.append(specimen) + specimen_observation_category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "laboratory", + "display": "laboratory" + } + ], + "text": "Laboratory" + } + ] + + specimen_participant_id = specimen_transformer.get_specimen_patient(_row=specimen_row) + specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, + official_focus="Specimen", + focus=[Reference(**{ + "reference": f"Specimen/{specimen.id}"})], + patient_id=specimen_participant_id, + specimen=specimen, components=None, + category=specimen_observation_category) + # print(specimen_observation.component) + if specimen_observation: + observations.append(specimen_observation) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) From a70f3489f702754fb4d614142d21e818241ed955 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 11:57:34 -0700 Subject: [PATCH 12/24] document reference - in progress --- fhirizer/htan2fhir.py | 145 +++++++++++++++++++++++++--- resources/htan_resources/files.json | 8 +- 2 files changed, 135 insertions(+), 18 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index c21fb1f..8ab7860 100644 --- 
a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -3,6 +3,7 @@ import numpy as np import orjson +import mimetypes import copy import glob import pathlib @@ -31,7 +32,7 @@ from fhir.resources.specimen import Specimen, SpecimenProcessing, SpecimenCollection from fhir.resources.condition import Condition, ConditionStage from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ - DocumentReferenceContentProfile + DocumentReferenceContentProfile, DocumentReferenceRelatesTo from fhir.resources.attachment import Attachment from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication @@ -127,6 +128,15 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.patient_demographics = self.get_patient_demographics() + # combine and create standard fhir files metadata + # print(self.files["Filename"].str.split('/')[1]) + self.files = self.files[self.files["Filename"].str.contains('.')] # NOTE: HTAPP contains file names ex. HTA1_982_7629309080080, that do not have any metadata + self.files = self.files[self.files["Filename"].str.contains('/')] + + self.files['mime_type'] = self.files["Filename"].apply(lambda x: mimetypes.guess_type(x)[0]) + self.files['name'] = self.files["Filename"].str.split('/').apply(lambda x: x[1]) + self.files_drs_meta = self.files.merge(self.files_drs_uri, how="left", on="name") + def get_cases_mappings(self) -> dict: """HTAN cases FHIR mapping""" return self.read_json(self.cases_path) @@ -340,6 +350,12 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien "component": components, "specimen": specimen_ref}) + def get_patient_id(self, participant_id) -> str: + patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) + patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + return patient_id + def write_ndjson(self, entities): resource_type = entities[0].resource_type entities = [orjson.loads(entity.json()) for entity in entities] @@ -593,6 +609,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.get_component = self.get_component self.get_fields_by_fhir_map = self.get_fields_by_fhir_map self.create_observation = self.create_observation + self.get_patient_id = self.get_patient_id def create_specimen(self, _row: pd.Series) -> Specimen: """Transform HTAN biospecimen to FHIR Specimen""" @@ -606,7 +623,7 @@ def create_specimen(self, _row: pd.Series) -> Specimen: participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." - patient_id = self.get_specimen_patient(_row=_row) + patient_id = self.get_patient_id(participant_id=participant_id) subject = Reference(**{"reference": f"Patient/{patient_id}"}) # Check if Group exists parent_specimen_reference = [] @@ -634,31 +651,121 @@ def create_specimen(self, _row: pd.Series) -> Specimen: "parent": parent_specimen_reference, "subject": subject}) - def get_specimen_patient(self, _row) -> str: - participant_id = self.decipher_htan_id(_row["HTAN Biospecimen ID"])["participant_id"] - assert participant_id, f"Specimen {_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
- patient_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "value": participant_id, "use": "official"}) - patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) - return patient_id +class DocumentReferenceTransformer(HTANTransformer): + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(**kwargs) + self.cases_mapping = self.cases_mappings + self.NAMESPACE_HTAN = self.NAMESPACE_HTAN + self.get_data_types = utils.get_data_types + self.get_component = self.get_component + self.get_fields_by_fhir_map = self.get_fields_by_fhir_map + self.create_observation = self.create_observation + self.get_patient_id = self.get_patient_id + + def create_document_reference(self, _row: pd.Series) -> DocumentReference: + """Transform HTAN files to FHIR DocumentReference""" + + document_reference_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['HTAN Data File ID'], "use": "official"}) + + document_reference_synapse_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['Synapse Id'], "use": "secondary"}) + + document_reference_id = self.mint_id(identifier=document_reference_identifier, + resource_type="DocumentReference", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + # participant id + patient_id = None + if "HTAN Participant ID" in _row.keys() and not pd.isnull(_row["HTAN Participant ID"]): + participant_id = _row["HTAN Participant ID"] + assert participant_id, f"DocumentRefernce {_row["HTAN Data File ID"]} does not have a patient participant associated with it." + patient_id = self.get_patient_id(participant_id=participant_id) + + name = None + if _row["Filename"]: + name = _row["Filename"] + + profiles = [] + if not pd.isnull(_row['drs_uri']): + uri_profile = DocumentReferenceContentProfile(**{"valueUri": _row['drs_uri']}) + profiles.append(uri_profile) + + category = [] + if not pd.isnull(_row['Assay']): + category.append(CodeableConcept(**{"coding": [{"code": _row['Assay'], "display": _row['Assay'], "system": "/".join([self.SYSTEM_HTAN, "Assay"])}]})) + if not pd.isnull(_row['Level']): + category.append(CodeableConcept(**{"coding": [{"code": _row['Level'], "display": _row['Level'], "system": "/".join([self.SYSTEM_HTAN, "Level"])}]})) + + subject = None + if patient_id: + Reference(**{"reference": f"Patient/{patient_id}"}) + + based_on = [] + if not pd.isnull(_row['Biospecimen']): + specimen_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row['Biospecimen'], "use": "official"}) + specimen_id = self.mint_id(identifier=specimen_identifier, resource_type="Specimen", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + based_on.append(Reference(**{"reference": f"Specimen/{specimen_id}"})) + + security_label = [] + if not pd.isnull(_row['Data Access']): + security_label.append(CodeableConcept(**{"coding": [{"code":_row['Data Access'], "display": _row['Data Access'], "system": "/".join([self.SYSTEM_HTAN, "Data_Access"])}]})) + + parent_data_file = [] + if not pd.isnull(_row["Parent Data File ID"]): + parent_document_reference_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": _row["Parent Data File ID"], "use": "official"}) + + parent_document_reference_id = self.mint_id(identifier=parent_document_reference_identifier, + resource_type="DocumentReference", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + parent_data_file.append(DocumentReferenceRelatesTo(**{ + "code": 
CodeableConcept(**{"coding": [{"code": "parent_data_file", + "system": "/".join([self.SYSTEM_HTAN, "Parent_Data_File_ID"]), + "display": "parent_data_file"}]}), + "target": Reference(**{"reference": f"Documentreference/{parent_document_reference_id}"})})) + + return DocumentReference(**{"id": document_reference_id, + "identifier": [document_reference_identifier, document_reference_synapse_identifier], + "status": "current", + "docStatus": "final", + # "basedOn": based_on, # TODO: requires check for specimen - missing data + "subject": subject, + # "relatesTo": parent_data_file, # TODO: requires check for file - missing data + "category": category, + "securityLabel": security_label, + "content": [DocumentReferenceContent( + **{"attachment": Attachment(**{"title": name, "contentType": _row["mime_type"]}), + "profile": profiles + })] + }) # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", - "Vanderbilt"] -for name in atlas_name: +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +for name in atlas_name: + # print(name) transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) + documentreference_transformer = DocumentReferenceTransformer(subprogram_name=name, + out_dir=f"./projects/HTAN/{name}/META", + verbose=False) patient_demographics_df = transformer.patient_demographics cases = transformer.cases htan_biospecimens = transformer.biospecimens + files = transformer.files + files_drs_meta = transformer.files_drs_meta patients = [] research_studies = [] @@ -727,7 +834,11 @@ def get_specimen_patient(self, _row) -> str: } ] - specimen_participant_id = specimen_transformer.get_specimen_patient(_row=specimen_row) + participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ + "participant_id"] + assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
+ + specimen_participant_id = specimen_transformer.get_patient_id(participant_id=participant_id) specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, official_focus="Specimen", focus=[Reference(**{ @@ -735,10 +846,15 @@ def get_specimen_patient(self, _row) -> str: patient_id=specimen_participant_id, specimen=specimen, components=None, category=specimen_observation_category) - # print(specimen_observation.component) if specimen_observation: observations.append(specimen_observation) + document_references = [] + for document_reference_index, document_reference_row in files_drs_meta.iterrows(): + docref = documentreference_transformer.create_document_reference(_row=document_reference_row) + if docref: + document_references.append(docref) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) @@ -746,6 +862,7 @@ def get_specimen_patient(self, _row) -> str: transformer.write_ndjson(conditions) transformer.write_ndjson(observations) transformer.write_ndjson(specimens) + transformer.write_ndjson(document_references) # participant ids from specimen identifiers # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) diff --git a/resources/htan_resources/files.json b/resources/htan_resources/files.json index 8e31dec..6baa511 100644 --- a/resources/htan_resources/files.json +++ b/resources/htan_resources/files.json @@ -29,8 +29,8 @@ ], "Level": [ { - "fhir_map": "Observation.component", - "focus": "DocumentReference" + "fhir_map": "DocumentReference.category", + "focus": null } ], "Organ": [ @@ -72,7 +72,7 @@ ], "HTAN Parent Biospecimen ID": [ { - "fhir_map": "Specimen.parent", + "fhir_map": "Specimen.identifier", "focus": null } ], @@ -352,7 +352,7 @@ ], "Parent Biospecimen ID": [ { - "fhir_map": "Specimen.parent", + "fhir_map": "Specimen.identifier", "focus": null } ], From 47b70e56b67d247ae27bd02b810b5d9fef225e39 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 12:47:17 -0700 Subject: [PATCH 13/24] specimen refernece --- fhirizer/htan2fhir.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 8ab7860..f1069ac 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -663,7 +663,7 @@ def __init__(self, *args: Any, **kwargs: Any): self.create_observation = self.create_observation self.get_patient_id = self.get_patient_id - def create_document_reference(self, _row: pd.Series) -> DocumentReference: + def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> DocumentReference: """Transform HTAN files to FHIR DocumentReference""" document_reference_identifier = Identifier( @@ -709,7 +709,8 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: specimen_id = self.mint_id(identifier=specimen_identifier, resource_type="Specimen", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) - based_on.append(Reference(**{"reference": f"Specimen/{specimen_id}"})) + if specimen_id in specimen_ids: + based_on.append(Reference(**{"reference": f"Specimen/{specimen_id}"})) security_label = [] if not pd.isnull(_row['Data Access']): @@ -734,7 +735,7 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: "identifier": [document_reference_identifier, document_reference_synapse_identifier], "status": "current", "docStatus": "final", - # "basedOn": based_on, # TODO: requires check for specimen - missing data + "basedOn": 
based_on, "subject": subject, # "relatesTo": parent_data_file, # TODO: requires check for file - missing data "category": category, @@ -749,9 +750,10 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] - +# atlas_name = ["OHSU"] for name in atlas_name: - # print(name) + print(f"Processing HTAN atlas {name}") + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) @@ -849,9 +851,10 @@ def create_document_reference(self, _row: pd.Series) -> DocumentReference: if specimen_observation: observations.append(specimen_observation) + specimen_ids = [s.id for s in specimens] document_references = [] for document_reference_index, document_reference_row in files_drs_meta.iterrows(): - docref = documentreference_transformer.create_document_reference(_row=document_reference_row) + docref = documentreference_transformer.create_document_reference(_row=document_reference_row, specimen_ids=specimen_ids) if docref: document_references.append(docref) From de69ad166cbcb7a89dcba8ac0fffc39693d6a88b Mon Sep 17 00:00:00 2001 From: teslajoy Date: Mon, 7 Oct 2024 13:39:04 -0700 Subject: [PATCH 14/24] observations w focus document reference --- fhirizer/htan2fhir.py | 129 +++++++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 39 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index f1069ac..1aed72d 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -60,6 +60,18 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson + self.lab_category = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "laboratory", + "display": "laboratory" + } + ], + "text": "Laboratory" + } + ] parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, @@ -130,7 +142,8 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): # combine and create standard fhir files metadata # print(self.files["Filename"].str.split('/')[1]) - self.files = self.files[self.files["Filename"].str.contains('.')] # NOTE: HTAPP contains file names ex. HTA1_982_7629309080080, that do not have any metadata + self.files = self.files[self.files["Filename"].str.contains( + '.')] # NOTE: HTAPP contains file names ex. 
HTA1_982_7629309080080, that do not have any metadata self.files = self.files[self.files["Filename"].str.contains('/')] self.files['mime_type'] = self.files["Filename"].apply(lambda x: mimetypes.guess_type(x)[0]) @@ -254,10 +267,10 @@ def decipher_htan_id(_id) -> dict: deciphered_id = {"participant_id": participant_id, "subsets": _id_substrings} return deciphered_id - def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: str, + def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: Optional[str], specimen: Optional[Specimen], official_focus: str, - focus: List[Reference], components: Optional[List], category: Optional[list]) -> Observation: - assert patient_id, f"Observation is missing patient id: {patient_id}." + focus: List[Reference], components: Optional[List], category: Optional[list], relax : bool) -> Observation: + # assert patient_id, f"Observation is missing patient id: {patient_id}." # HTAN files doesn't always point to patient assert focus, f"Observation for patient {patient_id} is missing focus." if not category: @@ -276,29 +289,41 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien observation_fields = [] - if official_focus not in ["Specimen"]: + if official_focus in ["Patient", "Condition"]: mappings = transformer.cases_mappings() + code = { + "coding": [ + { + "system": "http://loinc.org", + "code": "75323-6", + "display": "Condition" + } + ], + "text": "Condition" + } + elif official_focus in ["DocumentReference"]: + mappings = transformer.files_mappings() code = { "coding": [ { "system": self.SYSTEM_LOINC, "code": "68992-7", - "display": "Specimen-related information panel" + "display": "Specimen-related information panel" #TODO: find general code } ], "text": "Specimen-related information panel" } - else: + elif official_focus in ["Specimen"]: mappings = transformer.biospecimen_mappings() code = { "coding": [ { - "system": "http://loinc.org", - "code": "75323-6", - "display": "Condition" + "system": self.SYSTEM_LOINC, + "code": "68992-7", + "display": "Specimen-related information panel" } ], - "text": "Condition" + "text": "Specimen-related information panel" } for _field, _fhir_map, _use, _focus in self.get_fields_by_fhir_map(mappings, @@ -306,7 +331,10 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien if _focus == official_focus: observation_fields.append(_field) - _obervation_row = _row[observation_fields] if observation_fields else None + if not relax: + _obervation_row = _row[observation_fields] if observation_fields else None + else: + _obervation_row = _row # user-specific columns in files - add all to component if _obervation_row is not None: components = [] @@ -339,6 +367,10 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien specimen_ref = None if specimen: specimen_ref = Reference(**{"reference": f"Specimen/{specimen.id}"}) + + subject = None + if patient_id: + subject = Reference(**{"reference": f"Patient/{patient_id}"}) # add valueCodeableConcept as needed after creation return Observation(**{"id": observation_id, "identifier": [observation_identifier], @@ -346,7 +378,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien "category": category, "code": code, "focus": focus, - "subject": Reference(**{"reference": f"Patient/{patient_id}"}), + "subject": subject, "component": components, "specimen": specimen_ref}) @@ -694,9 +726,11 @@ def create_document_reference(self, 
_row: pd.Series, specimen_ids: list) -> Docu category = [] if not pd.isnull(_row['Assay']): - category.append(CodeableConcept(**{"coding": [{"code": _row['Assay'], "display": _row['Assay'], "system": "/".join([self.SYSTEM_HTAN, "Assay"])}]})) + category.append(CodeableConcept(**{"coding": [ + {"code": _row['Assay'], "display": _row['Assay'], "system": "/".join([self.SYSTEM_HTAN, "Assay"])}]})) if not pd.isnull(_row['Level']): - category.append(CodeableConcept(**{"coding": [{"code": _row['Level'], "display": _row['Level'], "system": "/".join([self.SYSTEM_HTAN, "Level"])}]})) + category.append(CodeableConcept(**{"coding": [ + {"code": _row['Level'], "display": _row['Level'], "system": "/".join([self.SYSTEM_HTAN, "Level"])}]})) subject = None if patient_id: @@ -714,7 +748,9 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu security_label = [] if not pd.isnull(_row['Data Access']): - security_label.append(CodeableConcept(**{"coding": [{"code":_row['Data Access'], "display": _row['Data Access'], "system": "/".join([self.SYSTEM_HTAN, "Data_Access"])}]})) + security_label.append(CodeableConcept(**{"coding": [ + {"code": _row['Data Access'], "display": _row['Data Access'], + "system": "/".join([self.SYSTEM_HTAN, "Data_Access"])}]})) parent_data_file = [] if not pd.isnull(_row["Parent Data File ID"]): @@ -722,8 +758,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu **{"system": self.SYSTEM_HTAN, "value": _row["Parent Data File ID"], "use": "official"}) parent_document_reference_id = self.mint_id(identifier=parent_document_reference_identifier, - resource_type="DocumentReference", project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + resource_type="DocumentReference", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) parent_data_file.append(DocumentReferenceRelatesTo(**{ "code": CodeableConcept(**{"coding": [{"code": "parent_data_file", @@ -732,7 +768,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "target": Reference(**{"reference": f"Documentreference/{parent_document_reference_id}"})})) return DocumentReference(**{"id": document_reference_id, - "identifier": [document_reference_identifier, document_reference_synapse_identifier], + "identifier": [document_reference_identifier, + document_reference_synapse_identifier], "status": "current", "docStatus": "final", "basedOn": based_on, @@ -741,7 +778,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "category": category, "securityLabel": security_label, "content": [DocumentReferenceContent( - **{"attachment": Attachment(**{"title": name, "contentType": _row["mime_type"]}), + **{"attachment": Attachment( + **{"title": name, "contentType": _row["mime_type"]}), "profile": profiles })] }) @@ -749,11 +787,12 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", "Vanderbilt"] +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", + "Vanderbilt"] # atlas_name = ["OHSU"] for name in atlas_name: - print(f"Processing HTAN atlas {name}") - + print(f"Transforming {name}") + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) patient_transformer = 
PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=False) @@ -812,7 +851,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu focus=[Reference(**{ "reference": f"Condition/{condition.id}"})], specimen=None, components=None, - category=None) + category=None, + relax=False) if condition_observation: observations.append(condition_observation) @@ -823,19 +863,6 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu if specimen: specimens.append(specimen) - specimen_observation_category = [ - { - "coding": [ - { - "system": "http://terminology.hl7.org/CodeSystem/observation-category", - "code": "laboratory", - "display": "laboratory" - } - ], - "text": "Laboratory" - } - ] - participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ "participant_id"] assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." @@ -847,17 +874,41 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "reference": f"Specimen/{specimen.id}"})], patient_id=specimen_participant_id, specimen=specimen, components=None, - category=specimen_observation_category) + category=transformer.lab_category, + relax=False) if specimen_observation: observations.append(specimen_observation) specimen_ids = [s.id for s in specimens] + patient_ids = [p.id for p in patients] document_references = [] for document_reference_index, document_reference_row in files_drs_meta.iterrows(): - docref = documentreference_transformer.create_document_reference(_row=document_reference_row, specimen_ids=specimen_ids) + docref = documentreference_transformer.create_document_reference(_row=document_reference_row, + specimen_ids=specimen_ids) if docref: document_references.append(docref) + docref_patient_id = None + if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull(document_reference_row['HTAN Participant ID']): + docref_patient = documentreference_transformer.get_patient_id(participant_id=document_reference_row['HTAN Participant ID']) + if docref_patient in patient_ids: + docref_patient_id = docref_patient + # else: + # print(f"HTAN {name} is missing patient reference in files") + + document_reference_observation = documentreference_transformer.create_observation( + _row=document_reference_row, patient=None, + official_focus="DocumentReference", + focus=[Reference(**{ + "reference": f"DocumentReference/{docref.id}"})], + patient_id=docref_patient_id, + specimen=None, components=None, + category=transformer.lab_category, + relax=True) + + if document_reference_observation: + observations.append(document_reference_observation) + transformer.write_ndjson(research_subjects) transformer.write_ndjson(research_studies) transformer.write_ndjson(patients) From 44b55f94aee030e5d8905379bdbb70e603235b97 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 8 Oct 2024 07:25:22 -0700 Subject: [PATCH 15/24] ChEMBL query --- fhirizer/utils.py | 88 +++++++++++++++++++++++++++++++++++++++-------- setup.py | 1 + 2 files changed, 74 insertions(+), 15 deletions(-) diff --git a/fhirizer/utils.py b/fhirizer/utils.py index 92f1744..16513ac 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -2,6 +2,7 @@ import orjson import time import random +import sqlite3 import json import glob import gzip @@ -17,8 +18,10 @@ from fhir.resources import get_fhir_model_class from uuid import uuid5, UUID -DATA_DICT_PATH = 
"".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'data_dictionary')), "/"]) -FIELDS_PATH = "".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'fields')), "/"]) +DATA_DICT_PATH = "".join( + [str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'data_dictionary')), "/"]) +FIELDS_PATH = "".join( + [str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'fields')), "/"]) package_dir = Path(importlib.resources.files('fhirizer').parent) @@ -205,6 +208,7 @@ def _read_json(path): except json.JSONDecodeError as e: print("Error decoding JSON: {}".format(e)) + # -------------------------------------------------------------------------- # GDC Utility functions # -------------------------------------------------------------------------- @@ -740,7 +744,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): if shared_keys: shared_keys_items = next(iter(shared_keys)) if verbose: - print(f"======== instance Dict {target_key} ============== case C", "shared_keys: ", shared_keys) + print(f"======== instance Dict {target_key} ============== case C", "shared_keys: ", + shared_keys) if isinstance(data[key][0][shared_keys_items], str) and isinstance( data_to_append[shared_keys_items], str) and data[key][0][shared_keys_items] != \ @@ -752,7 +757,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): print("Specimen.id" in item.keys()) print(len(item.keys())) - if len(item.keys()) == 1 and "Specimen.id" in list(item.keys())[0] and data_to_append.keys() != item.keys(): + if len(item.keys()) == 1 and "Specimen.id" in list(item.keys())[ + 0] and data_to_append.keys() != item.keys(): # this is where metadata is updated if the head key with Specimen.id exists item.update(data_to_append) reached = True @@ -762,7 +768,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): # this is where first Specimen.id is appended data[key].append(data_to_append) if verbose: - print(f"======== instance Dict {target_key} ============== case D AFTER", "data[key]: ", data[key], "\n\n") + print(f"======== instance Dict {target_key} ============== case D AFTER", + "data[key]: ", data[key], "\n\n") continue elif isinstance(data[key][0][shared_keys_items], str) and isinstance( @@ -799,7 +806,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): d.update(data_to_append) continue if verbose: - print(f"======== instance Dict {target_key} ============== case F After", "data[key]: ", data[key], "\n\n") + print(f"======== instance Dict {target_key} ============== case F After", + "data[key]: ", data[key], "\n\n") continue elif (isinstance(data[key][0][shared_keys_items], list) and isinstance( @@ -807,7 +815,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): not data[key][0][shared_keys_items][0].items() <= data_to_append[shared_keys_items][ 0].items()): if verbose: - print(f"======== instance Dict {target_key} ============== case G", "data[key]: ", data[key]) + print(f"======== instance Dict {target_key} ============== case G", "data[key]: ", + data[key]) if data[key][0]: if len(data[key]) > 1 and len(data[key][-1]) == 1: @@ -820,7 +829,8 @@ def append_data_to_key(data, target_key, data_to_append, verbose): and not data_to_append.items() <= item.items()): item.update(data_to_append) if verbose: - print(f"======== instance Dict {target_key} ============== case H AFTER", "item: ", item, "\n\n") + 
print(f"======== instance Dict {target_key} ============== case H AFTER", + "item: ", item, "\n\n") continue elif (data[key] and key == "samples" @@ -967,8 +977,8 @@ def make_request(api_url, retries=3): return response.json() else: print(f"Received status code: {response.status_code}. Retrying...") - delay *= 2 ** retries # change delay - time.sleep(delay + random.uniform(0, 1)) # add jitter + delay *= 2 ** retries # change delay + time.sleep(delay + random.uniform(0, 1)) # add jitter raise Exception("Failed to fetch data after multiple retries") @@ -982,7 +992,8 @@ def fetch_cellines(cellosaurus_ids, out_dir): if not os.path.exists(out_dir): os.makedirs(out_dir) - existing_ids = set(os.path.splitext(os.path.basename(file))[0] for file in os.listdir(out_dir) if file.endswith('.json')) + existing_ids = set( + os.path.splitext(os.path.basename(file))[0] for file in os.listdir(out_dir) if file.endswith('.json')) to_fetch_ids = set(cellosaurus_ids) - existing_ids for cellosaurus_id in to_fetch_ids: @@ -1038,8 +1049,8 @@ def cellosaurus_cancer_ids(path, out_path, save=False): # has sex annotation for celline in cl_cancer_depmap: for subset in celline["subset"]: - if subset in ["Female", "Male"]: - ids.append(celline["id"][0]) + if subset in ["Female", "Male"]: + ids.append(celline["id"][0]) # 67763 cell-lines # 62019 cell-lines w gender @@ -1090,7 +1101,8 @@ def get_data_types(data_type): return data_type -def get_component(key, value=None, component_type=None, system="https://cadsr.cancer.gov/sample_laboratory_observation"): +def get_component(key, value=None, component_type=None, + system="https://cadsr.cancer.gov/sample_laboratory_observation"): if component_type == 'string': value = {"valueString": value} elif component_type == 'int': @@ -1188,4 +1200,50 @@ def create_or_extend(new_items, folder_path='META', resource_type='Observation', else: print(f"{file_name} has been extended, without updating existing data.") else: - print(f"{file_name} has been created.") \ No newline at end of file + print(f"{file_name} has been created.") + + +def get_chembl_compound_info(db_file_path: str, drug_names: list, limit: int) -> list: + """Query Chembl COMPOUND_RECORDS by COMPOUND_NAME to make FHIR Substance""" + drug_names_tuple = tuple([x.upper() for x in drug_names]) + + query = f""" + SELECT + a.MOLREGNO, + a.PREF_NAME, + a.CHEMBL_ID, + a.MAX_PHASE, + a.STRUCTURE_TYPE, + c.STANDARD_INCHI, + c.STANDARD_INCHI_KEY, + c.CANONICAL_SMILES, + d.DOC_ID, + d.PUBMED_ID, + d.DOI, + cr.SRC_ID, + cr.SRC_COMPOUND_ID, + sr.SRC_SHORT_NAME, + sr.SRC_DESCRIPTION + FROM + MOLECULE_DICTIONARY as a + LEFT JOIN + COMPOUND_STRUCTURES as c ON a.MOLREGNO = c.MOLREGNO + LEFT JOIN + ACTIVITIES as p ON a.MOLREGNO = p.MOLREGNO + LEFT JOIN + DOCS as d ON p.DOC_ID = d.DOC_ID + LEFT JOIN + compound_records as cr ON a.MOLREGNO = cr.MOLREGNO + LEFT JOIN + source as sr ON cr.SRC_ID = sr.SRC_ID + WHERE cr.COMPOUND_NAME IN {drug_names_tuple} + LIMIT {limit}; + """ + conn = sqlite3.connect(db_file_path) + cursor = conn.cursor() + cursor.execute(query) + rows = cursor.fetchall() + + conn.close() + + return rows diff --git a/setup.py b/setup.py index 9190e4c..92aad84 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ 'inflection', 'iteration_utilities', 'icd10-cm', + 'sqlite3', 'beautifulsoup4', 'gen3-tracker>=0.0.4rc36', 'fhir.resources>=7.1.0' # FHIR® (Release R5, version 5.0.0) From 67677dcb9f9c5ff7f8d2d3339f6d27b6aaa24868 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 8 Oct 2024 07:27:18 -0700 Subject: [PATCH 16/24] remove lib 
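sqlite3 ships with the Python standard library, so it cannot be installed from PyPI and should not be listed in install_requires; this removes the entry added in the previous patch.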
--- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 92aad84..9190e4c 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ 'inflection', 'iteration_utilities', 'icd10-cm', - 'sqlite3', 'beautifulsoup4', 'gen3-tracker>=0.0.4rc36', 'fhir.resources>=7.1.0' # FHIR® (Release R5, version 5.0.0) From eb869764720ef920a5473f6026935153c675a59a Mon Sep 17 00:00:00 2001 From: teslajoy Date: Tue, 8 Oct 2024 10:53:56 -0700 Subject: [PATCH 17/24] initial htan medadmin and med --- fhirizer/htan2fhir.py | 160 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 140 insertions(+), 20 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 1aed72d..ae75619 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -26,6 +26,7 @@ from fhir.resources.observation import Observation from fhir.resources.encounter import Encounter from fhir.resources.codeableconcept import CodeableConcept +from fhir.resources.codeablereference import CodeableReference from fhir.resources.age import Age from fhir.resources.procedure import Procedure from fhir.resources.bodystructure import BodyStructure, BodyStructureIncludedStructure @@ -34,8 +35,11 @@ from fhir.resources.documentreference import DocumentReference, DocumentReferenceContent, \ DocumentReferenceContentProfile, DocumentReferenceRelatesTo from fhir.resources.attachment import Attachment +from fhir.resources.timing import Timing from fhir.resources.medicationadministration import MedicationAdministration -from fhir.resources.medication import Medication +from fhir.resources.medication import Medication, MedicationIngredient +from fhir.resources.substance import Substance, SubstanceIngredient +from fhir.resources.substancedefinition import SubstanceDefinition, SubstanceDefinitionStructure # File data on synapse after authentication @@ -49,6 +53,7 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.get_data_type = utils.get_data_types self.get_component = utils.get_component self.fhir_ndjson = utils.fhir_ndjson + self.get_chembl_compound_info = utils.get_chembl_compound_info self.subprogram_name = subprogram_name self.project_id = subprogram_name # incase there will be more granular project/program relations assert Path(out_dir).is_dir(), f"Path to out_dir {out_dir} is not a directory." @@ -72,7 +77,16 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): "text": "Laboratory" } ] - + self.med_admin_code = { + "coding": [ + { + "system": "http://loinc.org", + "code": "80565-5", + "display": "Medication administration record" + } + ], + "text": "Medication administration record" + } parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, resource_type="ResearchStudy", @@ -269,7 +283,8 @@ def decipher_htan_id(_id) -> dict: def create_observation(self, _row: pd.Series, patient: Optional[Patient], patient_id: Optional[str], specimen: Optional[Specimen], official_focus: str, - focus: List[Reference], components: Optional[List], category: Optional[list], relax : bool) -> Observation: + focus: List[Reference], components: Optional[List], category: Optional[list], + relax: bool) -> Observation: # assert patient_id, f"Observation is missing patient id: {patient_id}." # HTAN files doesn't always point to patient assert focus, f"Observation for patient {patient_id} is missing focus." 
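# A sketch, not part of the patch: the mint_id(...) calls used throughout these
# hunks mint deterministic FHIR resource ids from HTAN identifiers. The real
# implementation lives in fhirizer.utils; a hypothetical equivalent built on the
# uuid3(NAMESPACE_DNS, SYSTEM_HTAN) namespace defined at the top of htan2fhir.py:
from uuid import uuid3, uuid5, NAMESPACE_DNS

SYSTEM_HTAN = "https://data.humantumoratlas.org"
NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, SYSTEM_HTAN)

def mint_id_sketch(identifier_value: str, resource_type: str, project_id: str) -> str:
    # Compose a stable name and hash it into the HTAN namespace; the exact
    # composition used by utils.mint_id may differ - illustrative only.
    return str(uuid5(NAMESPACE_HTAN, "|".join([project_id, resource_type, identifier_value])))

# The same HTAN participant id always yields the same Patient id:
assert mint_id_sketch("HTA9_1", "Patient", "OHSU") == mint_id_sketch("HTA9_1", "Patient", "OHSU")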
@@ -301,6 +316,11 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien ], "text": "Condition" } + + elif official_focus in ["MedicationAdministration"]: + mappings = transformer.cases_mappings() + code = self.med_admin_code + elif official_focus in ["DocumentReference"]: mappings = transformer.files_mappings() code = { @@ -308,11 +328,12 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien { "system": self.SYSTEM_LOINC, "code": "68992-7", - "display": "Specimen-related information panel" #TODO: find general code + "display": "Specimen-related information panel" # TODO: find general code } ], "text": "Specimen-related information panel" } + elif official_focus in ["Specimen"]: mappings = transformer.biospecimen_mappings() code = { @@ -334,7 +355,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien if not relax: _obervation_row = _row[observation_fields] if observation_fields else None else: - _obervation_row = _row # user-specific columns in files - add all to component + _obervation_row = _row # user-specific columns in files - add all to component if _obervation_row is not None: components = [] @@ -628,8 +649,73 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) - def create_medication_administration(self) -> MedicationAdministration: - return MedicationAdministration(**{}) + def create_medication_administration(self, _row: pd.Series, patient_id: str) -> dict: + # if Treatment Type exists - make MedicationAdministration + # if Days to Treatment End, then status -> completed, else status unknown + # if Therapeutic Agents is null, then Medication.code -> snomed_code: Unknown 261665006 + # Medication.ingredient.item -> Substance.code -> SubstanceDefination + + status = None + substance_definition = None + substance = None + medication = None + medication_code = None + + if not pd.isnull(_row["Days to Treatment End"]): + status = "completed" + else: + status = "unknown" + + if pd.isnull(_row["Therapeutic Agents"]): + medication_code = CodeableConcept(**{"coding": [{ + "code": "261665006", + "system": self.SYSTEM_SNOME, + "display": "Unknown" + }]}) + else: + # drug_info_df = pd.DataFrame(self.get_chembl_compound_info(db_file_path="./reources/chemble/chembl_34.db", drug_names=list(_row["Theraputic Agent"]))) + medication_code = CodeableConcept(**{"coding": [{"code": _row["Therapeutic Agents"], + "system": self.SYSTEM_HTAN, + "display": _row["Therapeutic Agents"]}]}) + + timing = 0 + if not pd.isnull(_row["Days to Treatment End"]) and not pd.isnull(_row["Days to Treatment Start"]): + timing = int(_row["Days to Treatment End"]) - int(_row["Days to Treatment Start"]) + + # TODO: replace with chembl + # substance_definition = SubstanceDefinition(**{}) + # substance = Substance(**{}) + + medication_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", + "value": medication_code.coding[0].display}) + medication_id = self.mint_id(identifier=medication_identifier, + resource_type="Medication", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + medication = Medication(**{"id": medication_id, "identifier": [medication_identifier], "code": medication_code}) + + medication_admin_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "use": "official", + "value": "-".join([_row["Atlas Name"], _row["HTAN Participant ID"], _row["Treatment Type"]])}) + medication_admin_id = self.mint_id(identifier=medication_admin_identifier, + 
resource_type="MedicationAdministration", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + data = {"id": medication_admin_id, + "identifier": [medication_admin_identifier], + "status": status, + "occurenceDateTime": "2024-10-8T10:30:00.724446-05:00", + "category": [CodeableConcept(**{"coding": [{"code": _row["Treatment Type"], + "system": "/".join([self.SYSTEM_HTAN,"Treatment_Type"]) , + "display": _row["Treatment Type"]}]})], + "medication": CodeableReference(**{"concept": medication_code, "reference": Reference( + **{"reference": f"Medication/{medication.id}"})}), + "subject": Reference(**{"reference": f"Patient/{patient_id}"})} + medication_admin = MedicationAdministration(**data) + + return {"medication_admin": medication_admin, + "medication": medication, "substance": substance, + "substance_definition": substance_definition} class SpecimenTransformer(HTANTransformer): @@ -780,15 +866,15 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu "content": [DocumentReferenceContent( **{"attachment": Attachment( **{"title": name, "contentType": _row["mime_type"]}), - "profile": profiles - })] + "profile": profiles + })] }) # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", - "Vanderbilt"] + "Vanderbilt"] # atlas_name = ["OHSU"] for name in atlas_name: print(f"Transforming {name}") @@ -814,6 +900,8 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu conditions = [] encounters = [] observations = [] + med_admins = [] + med = [] for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) @@ -856,6 +944,24 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu if condition_observation: observations.append(condition_observation) + if not pd.isnull(row["Treatment Type"]): + med_admin_dict = patient_transformer.create_medication_administration(_row=row, + patient_id=patient.id) + if med_admin_dict["medication_admin"]: + med_admins.append(med_admin_dict["medication_admin"]) + med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, + official_focus="MedicationAdministration", + focus=[Reference(**{ + "reference": f"MedicationAdministration/{med_admin_dict["medication_admin"].id}"})], + patient_id=patient.id, + specimen=None, components=None, + category=None, + relax=False) + if med_admin_observation: + observations.append(med_admin_observation) + if med_admin_dict["medication"]: + med.append(med_admin_dict["medication"]) + specimens = [] for specimen_index, specimen_row in htan_biospecimens.iterrows(): # specimen_row = htan_biospecimens.iloc[specimen_index] @@ -889,8 +995,10 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu document_references.append(docref) docref_patient_id = None - if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull(document_reference_row['HTAN Participant ID']): - docref_patient = documentreference_transformer.get_patient_id(participant_id=document_reference_row['HTAN Participant ID']) + if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull( + document_reference_row['HTAN Participant ID']): + docref_patient = documentreference_transformer.get_patient_id( + participant_id=document_reference_row['HTAN Participant ID']) if docref_patient in patient_ids: 
docref_patient_id = docref_patient # else: @@ -909,14 +1017,26 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu if document_reference_observation: observations.append(document_reference_observation) - transformer.write_ndjson(research_subjects) - transformer.write_ndjson(research_studies) - transformer.write_ndjson(patients) - transformer.write_ndjson(encounters) - transformer.write_ndjson(conditions) - transformer.write_ndjson(observations) - transformer.write_ndjson(specimens) - transformer.write_ndjson(document_references) + if research_subjects: + transformer.write_ndjson(research_subjects) + if research_studies: + transformer.write_ndjson(research_studies) + if patients: + transformer.write_ndjson(patients) + if encounters: + transformer.write_ndjson(encounters) + if conditions: + transformer.write_ndjson(conditions) + if observations: + transformer.write_ndjson(observations) + if specimens: + transformer.write_ndjson(specimens) + if document_references: + transformer.write_ndjson(document_references) + if med_admins: + transformer.write_ndjson(med_admins) + if med: + transformer.write_ndjson(med) # participant ids from specimen identifiers # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) From bac208473bd5bb8ef42310476af381bf67121425 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 9 Oct 2024 09:28:19 -0700 Subject: [PATCH 18/24] medadmin - med - substance - substancedefinition --- fhirizer/htan2fhir.py | 202 +++++++++++++++++++++++----- fhirizer/utils.py | 17 +-- scripts/gdc_scan.py | 297 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 468 insertions(+), 48 deletions(-) create mode 100644 scripts/gdc_scan.py diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index ae75619..5fe4846 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -39,7 +39,7 @@ from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication, MedicationIngredient from fhir.resources.substance import Substance, SubstanceIngredient -from fhir.resources.substancedefinition import SubstanceDefinition, SubstanceDefinitionStructure +from fhir.resources.substancedefinition import SubstanceDefinition,SubstanceDefinitionStructure, SubstanceDefinitionStructureRepresentation, SubstanceDefinitionName # File data on synapse after authentication @@ -62,6 +62,7 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): self.SYSTEM_HTAN = 'https://data.humantumoratlas.org' self.SYSTEM_SNOME = 'http://snomed.info/sct' self.SYSTEM_LOINC = 'http://loinc.org' + self.SYSTEM_chEMBL = 'https://www.ebi.ac.uk/chembl' self.NAMESPACE_HTAN = uuid3(NAMESPACE_DNS, self.SYSTEM_HTAN) self.read_json = utils._read_json self.fhir_ndjson = utils.fhir_ndjson @@ -408,6 +409,87 @@ def get_patient_id(self, participant_id) -> str: patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) return patient_id + @staticmethod + def create_substance_definition_representations(df: pd.DataFrame) -> list: + representations = [] + for index, _row in df.iterrows(): + if pd.notna(_row['STANDARD_INCHI']): + representations.append(SubstanceDefinitionStructureRepresentation( + **{"representation": _row['STANDARD_INCHI'], + "format": CodeableConcept(**{"coding": [{"code": "InChI", + "system": 'http://hl7.org/fhir/substance-representation-format', + "display": "InChI"}]})})) + + if 
pd.notna(_row['CANONICAL_SMILES']): + representations.append(SubstanceDefinitionStructureRepresentation( + **{"representation": _row['CANONICAL_SMILES'], + "format": CodeableConcept(**{"coding": [{"code": "SMILES", + "system": 'http://hl7.org/fhir/substance-representation-format', + "display": "SMILES"}]})})) + return representations + + def create_substance_definition(self, compound_name: str, representations: list) -> SubstanceDefinition: + sub_def_identifier = Identifier(**{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) + sub_def_id = self.mint_id(identifier=sub_def_identifier, resource_type="SubstanceDefinition", project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + return SubstanceDefinition(**{"id": sub_def_id, + "identifier": [sub_def_identifier], + "structure": SubstanceDefinitionStructure(**{"representation": representations}), + "name": [SubstanceDefinitionName(**{"name": compound_name})] + }) + + def create_substance(self, compound_name:str, substance_definition: SubstanceDefinition) -> Substance: + code = None + if substance_definition: + code = CodeableReference( + **{"concept": CodeableConcept(**{"coding": [{"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), "display": compound_name}]}), + "reference": Reference(**{"reference": f"SubstanceDefinition/{substance_definition.id}"})}) + + sub_identifier = Identifier( + **{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) + sub_id = self.mint_id(identifier=sub_identifier, resource_type="Substance", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + return Substance(**{"id": sub_id, + "identifier": [sub_identifier], + "instance": True, # place-holder + "category": [CodeableConcept(**{"coding": [{"code": "drug", + "system": "http://terminology.hl7.org/CodeSystem/substance-category", + "display": "Drug or Medicament"}]})], + "code": code}) + + def create_medication(self, compound_name: Optional[str], treatment_type: Optional[str], _substance: Optional[Substance]) -> Medication: + code = None + med_identifier = None + if compound_name: + code = CodeableConcept(**{"coding": [ + {"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), + "display": compound_name}]}) + + med_identifier = Identifier( + **{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) + else: + code = CodeableConcept(**{"coding": [ + {"code": treatment_type, "system": "/".join([self.SYSTEM_HTAN, "treatment_type"]), + "display": treatment_type}]}) + + med_identifier = Identifier( + **{"system": self.SYSTEM_HTAN, "value": treatment_type, "use": "official"}) + + med_id = self.mint_id(identifier=med_identifier, resource_type="Medication", + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) + + ingredients = [] + if _substance: + ingredients.append(MedicationIngredient(**{"item": CodeableReference(**{"reference": Reference(**{"reference": f"Substance/{_substance.id}"})})})) + + return Medication(**{"id": med_id, + "identifier": [med_identifier], + "code": code, + "ingredient": ingredients}) def write_ndjson(self, entities): resource_type = entities[0].resource_type @@ -415,6 +497,68 @@ def write_ndjson(self, entities): entities = list({v['id']: v for v in entities}.values()) utils.fhir_ndjson(entities, "".join([self.out_dir, "/", resource_type, ".ndjson"])) + def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.DataFrame: + # create medication placeholder for cases where treatment type 
is defined ex chemo, but medication is not documented + # MedicationAdministration - Medication - Substance - SubstanceDefinition + drugname_fhir_ids = {} + substance_definitions = [] + substances = [] + medications = [] + if not cases["Therapeutic Agents"].isnull().all(): + cases["Therapeutic Agents"] = cases["Therapeutic Agents"].str.upper() + drug_names = list(cases["Therapeutic Agents"][~cases["Therapeutic Agents"].isna()].unique()) + # drug_names = [d.upper() for d in drug_names] + dat = self.get_chembl_compound_info(db_file_path=db_file_path, drug_names=drug_names, limit=1000) + drug_df = pd.DataFrame(dat) + drug_df.columns = ["CHEMBL_ID", "STANDARD_INCHI", "CANONICAL_SMILES", "COMPOUND_NAME"] + + for drug in drug_names: + drug_info = drug_df[drug_df.COMPOUND_NAME.isin([drug])] + drug_info["has_info"] = drug_info[['STANDARD_INCHI', 'CANONICAL_SMILES']].notna().any(axis=1) + if drug_info["has_info"].any(): + drug_representations = self.create_substance_definition_representations(drug_info) + substance_definition = self.create_substance_definition(compound_name=drug, + representations=drug_representations) + + if substance_definition: + substance_definitions.append(substance_definition) + + substance = self.create_substance(compound_name=drug, substance_definition=substance_definition) + + if substance: + substances.append(substance) + medication = self.create_medication(compound_name=drug, _substance=substance, treatment_type=None) + if medication: + medications.append(medication) + drugname_fhir_ids.update({drug: medication.id}) + + else: + medication = self.create_medication(compound_name=drug, _substance=None, treatment_type=None) + medications.append(medication) + drugname_fhir_ids.update({drug: medication.id}) + + if substance_definitions: + transformer.write_ndjson(substance_definitions) + if substances: + transformer.write_ndjson(substances) + + cases['Medication_ID'] = cases['Therapeutic Agents'].map(drugname_fhir_ids, na_action='ignore') + + for index, row in cases.iterrows(): + if pd.isnull(row["Therapeutic Agents"]) and not pd.isnull(row["Treatment Type"]): + medication_agent = self.create_medication(compound_name=None, _substance=None, treatment_type=row["Treatment Type"]) + if medication_agent: + medications.append(medication_agent) + cases.loc[index, 'Medication_ID'] = medication_agent.id + + if row['Therapeutic Agents'] in drugname_fhir_ids.keys(): + cases.loc[index, 'Medication_ID'] = drugname_fhir_ids[row['Therapeutic Agents']] + + if medications: + transformer.write_ndjson(medications) + if 'Medication_ID' in cases.columns: + return cases + class PatientTransformer(HTANTransformer): def __init__(self, *args: Any, **kwargs: Any): @@ -649,7 +793,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "stage": [], }) - def create_medication_administration(self, _row: pd.Series, patient_id: str) -> dict: + def create_medication_administration(self, _row: pd.Series, patient_id: str) -> MedicationAdministration: # if Treatment Type exists - make MedicationAdministration # if Days to Treatment End, then status -> completed, else status unknown # if Therapeutic Agents is null, then Medication.code -> snomed_code: Unknown 261665006 @@ -682,19 +826,6 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> if not pd.isnull(_row["Days to Treatment End"]) and not pd.isnull(_row["Days to Treatment Start"]): timing = int(_row["Days to Treatment End"]) - int(_row["Days to Treatment Start"]) - # TODO: replace with chembl - # 
substance_definition = SubstanceDefinition(**{}) - # substance = Substance(**{}) - - medication_identifier = Identifier( - **{"system": self.SYSTEM_HTAN, "use": "official", - "value": medication_code.coding[0].display}) - medication_id = self.mint_id(identifier=medication_identifier, - resource_type="Medication", - project_id=self.project_id, namespace=self.NAMESPACE_HTAN) - - medication = Medication(**{"id": medication_id, "identifier": [medication_identifier], "code": medication_code}) - medication_admin_identifier = Identifier( **{"system": self.SYSTEM_HTAN, "use": "official", "value": "-".join([_row["Atlas Name"], _row["HTAN Participant ID"], _row["Treatment Type"]])}) @@ -706,16 +837,13 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> "status": status, "occurenceDateTime": "2024-10-8T10:30:00.724446-05:00", "category": [CodeableConcept(**{"coding": [{"code": _row["Treatment Type"], - "system": "/".join([self.SYSTEM_HTAN,"Treatment_Type"]) , + "system": "/".join([self.SYSTEM_HTAN, "Treatment_Type"]) , "display": _row["Treatment Type"]}]})], "medication": CodeableReference(**{"concept": medication_code, "reference": Reference( - **{"reference": f"Medication/{medication.id}"})}), + **{"reference": f"Medication/{_row['Medication_ID']}"})}), "subject": Reference(**{"reference": f"Patient/{patient_id}"})} - medication_admin = MedicationAdministration(**data) - return {"medication_admin": medication_admin, - "medication": medication, "substance": substance, - "substance_definition": substance_definition} + return MedicationAdministration(**data) class SpecimenTransformer(HTANTransformer): @@ -873,9 +1001,14 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", "TNP_SARDANA", - "Vanderbilt"] +# TNP_SARDANA drug name syntax error +atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", + "Vanderbilt"] + # atlas_name = ["OHSU"] +db_path = '../../bmeg_backup_0516/bmeg-etl_chembl/source/chembl/chembl_34/chembl_34_sqlite/chembl_34.db' + + for name in atlas_name: print(f"Transforming {name}") @@ -901,7 +1034,10 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu encounters = [] observations = [] med_admins = [] - med = [] + + if not cases["Therapeutic Agents"].isnull().all() or not cases["Treatment Type"].isnull().all(): + cases = transformer.transform_medication(cases, db_file_path=db_path) + for index, row in cases.iterrows(): research_study = patient_transformer.create_researchstudy(_row=row) @@ -945,22 +1081,20 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu observations.append(condition_observation) if not pd.isnull(row["Treatment Type"]): - med_admin_dict = patient_transformer.create_medication_administration(_row=row, - patient_id=patient.id) - if med_admin_dict["medication_admin"]: - med_admins.append(med_admin_dict["medication_admin"]) + med_admin = patient_transformer.create_medication_administration(_row=row, + patient_id=patient.id) + if med_admin: + med_admins.append(med_admin) med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, official_focus="MedicationAdministration", focus=[Reference(**{ - "reference": f"MedicationAdministration/{med_admin_dict["medication_admin"].id}"})], + 
"reference": f"MedicationAdministration/{med_admin.id}"})], patient_id=patient.id, specimen=None, components=None, category=None, relax=False) if med_admin_observation: observations.append(med_admin_observation) - if med_admin_dict["medication"]: - med.append(med_admin_dict["medication"]) specimens = [] for specimen_index, specimen_row in htan_biospecimens.iterrows(): @@ -1035,9 +1169,11 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu transformer.write_ndjson(document_references) if med_admins: transformer.write_ndjson(med_admins) - if med: - transformer.write_ndjson(med) + # participant ids from specimen identifiers # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) # print(transformer.decipher_htan_id(cases["HTAN Participant ID"][0])) + +# make all possible medications in cases +# point to the medication in MedicationAdministartion - Medication -> Substance -> SubstanceDefination 0...* [representaiton.format representation.string] diff --git a/fhirizer/utils.py b/fhirizer/utils.py index 16513ac..e236b2f 100644 --- a/fhirizer/utils.py +++ b/fhirizer/utils.py @@ -1208,30 +1208,17 @@ def get_chembl_compound_info(db_file_path: str, drug_names: list, limit: int) -> drug_names_tuple = tuple([x.upper() for x in drug_names]) query = f""" - SELECT - a.MOLREGNO, - a.PREF_NAME, + SELECT DISTINCT a.CHEMBL_ID, - a.MAX_PHASE, - a.STRUCTURE_TYPE, c.STANDARD_INCHI, - c.STANDARD_INCHI_KEY, c.CANONICAL_SMILES, - d.DOC_ID, - d.PUBMED_ID, - d.DOI, - cr.SRC_ID, - cr.SRC_COMPOUND_ID, - sr.SRC_SHORT_NAME, - sr.SRC_DESCRIPTION + cr.COMPOUND_NAME FROM MOLECULE_DICTIONARY as a LEFT JOIN COMPOUND_STRUCTURES as c ON a.MOLREGNO = c.MOLREGNO LEFT JOIN ACTIVITIES as p ON a.MOLREGNO = p.MOLREGNO - LEFT JOIN - DOCS as d ON p.DOC_ID = d.DOC_ID LEFT JOIN compound_records as cr ON a.MOLREGNO = cr.MOLREGNO LEFT JOIN diff --git a/scripts/gdc_scan.py b/scripts/gdc_scan.py new file mode 100644 index 0000000..9f53ea0 --- /dev/null +++ b/scripts/gdc_scan.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python + +import argparse +import json +import logging +import os +import sys +import shutil +import time +from tqdm import tqdm + +import requests + +URL_BASE = "https://api.gdc.cancer.gov/" +TOKEN = None +client = requests + + +def get_file(file_id, path): + """ download a file from gdc, save in path """ + if os.path.isfile(path): + return path + endpoint = 'data/{}'.format(file_id) + req = client.get(URL_BASE + endpoint) + if not os.path.exists( os.path.dirname(path) ): + os.makedirs(os.path.dirname(path)) + with open(path, 'wb') as out: + out.write(req.content) + return path + + +def query_gdc(endpoint, params): + """ + query_gdc makes a query to the GDC API while handling common issues + like pagination, retries, etc. + + The return value is an iterator. + """ + # Copy input params to avoid modification. + params = dict(params) + page_size = 100 + params['size'] = page_size + + # With a GET request, the filters parameter needs to be converted + # from a dictionary to JSON-formatted string + if 'filters' in params: + params['filters'] = json.dumps(params['filters']) + + headers = None + if TOKEN is not None: + headers = { + "X-Auth-Token" : TOKEN + } + failCount = 0 + # Iterate through all the pages. 
+ with tqdm(total=page_size) as pbar: + while True: + try: + req = client.get(URL_BASE + endpoint, params=params, headers=headers) + data = req.json() + + if 'data' not in data: + print("Bad return %s" % (data)) + failCount += 1 + if failCount >= 10: + raise Exception("Too many failures") + time.sleep(10) + else: + failCount = 0 + data = data['data'] + hits = data.get("hits", []) + if len(hits) == 0: + return + for hit in hits: + yield hit + pbar.total = data['pagination']['total'] + pbar.update( data['pagination']['count'] ) + # Get the next page. + params['from'] = data['pagination']['from'] + page_size + except Exception as e: + if failCount >= 10: + logging.warning(str(e)) + logging.warning(json.dumps(params)) + raise + failCount += 1 + print("Connection Issue %s" % (e)) + time.sleep(10) + +# The GDC API requires you to request that nested fields be expanded. +# https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#cases-field-groups +# +# Note that (as of this writing) we are expanding most but +# not all possible fields. Mostly we're skipping "files" data. +expand_case_fields = ",".join(""" +demographic +diagnoses +diagnoses.treatments +exposures +family_histories +project +project.program +samples +samples.annotations +samples.portions +samples.portions.analytes +samples.portions.analytes.aliquots +samples.portions.analytes.aliquots.annotations +samples.portions.analytes.aliquots.center +samples.portions.analytes.annotations +samples.portions.annotations +samples.portions.center +samples.portions.slides +samples.portions.slides.annotations +summary +summary.data_categories +summary.experimental_strategies +tissue_source_site +type +""".strip().split()) + +# These are the fields we want to keep from the GDC Case (BMEG Case). +keep_case_fields = """ +diagnoses +demographic +disease_type +primary_site +summary +project +""".strip().split() + +expand_project_fields = ",".join(""" +dbgap_accession_number +disease_type +name +primary_site +project_id +released +state +program +summary +""".strip().split()) + + +def scrapeProjects(outfile): + projectOut = open(outfile, "w") + for row in query_gdc("projects", {"expand": expand_project_fields}): + projectOut.write(json.dumps(row)) + projectOut.write("\n") + projectOut.close() + + +def scrapeCases(outfile): + # Crawl all cases, samples, aliquots to generate + # BMEG Cases, Samples, and Aliquots. 
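+    # NOTE: requests the expanded case field groups defined above and writes
+    # one JSON object per line (ndjson) to outfile.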
+ parameters={} + parameters['expand'] = expand_case_fields + case_gids = [] + caseOut = open(outfile, "w") + + for row in query_gdc("cases", parameters): + caseOut.write(json.dumps(row)) + caseOut.write("\n") + + caseOut.close() + +def scrapeCompounds(outdir): + """ the only way to get drugs is to download files and parse them""" + my_filters = json.loads(""" + {"op":"and","content":[{"op":"in","content":{"field":"files.data_type","value":["Clinical data"]}},{"op":"in","content":{"field":"files.tags","value":["drug"]}}]} + """) + + parameters = {'filters' : my_filters} + for row in query_gdc("legacy/files", parameters): + get_file(row['file_id'], '{}/{}.tsv'.format(outdir, row['file_id'])) + +def scrapeFiles(outfile): + parameters={} + parameters['expand'] = ",".join(["cases", "cases.aliquot_ids", "cases.project", "cases.samples.portions.analytes.aliquots", "index_files"]) + + filesOut = open(outfile, "w") + + for row in query_gdc("files", parameters): + filesOut.write(json.dumps(row)) + filesOut.write("\n") + filesOut.close() + +def scrapeExpression(outdir): + parameters = { "filters" : { + "op" : "and", + "content":[{ + "op" : "in", + "content": { + "field" : "data_category", + "value":["Transcriptome Profiling"] + } + },{ + "op" : "in", + "content": { + "field" : "access", + "value":["open"] + } + },{ + "op" : "in", + "content" : { + "field" : "experimental_strategy", + "value":["RNA-Seq"] + } + }] + } } + for row in query_gdc("files", parameters): + outPath = '{}/{}.tsv'.format(outdir, row['file_id']) + if not os.path.exists(outPath): + get_file(row['file_id'], outPath + ".tmp" ) + shutil.move(outPath + ".tmp", outPath) + #print(row) + + +def scrapeOpenMaf(outdir): + parameters = { "filters" : { + "op" : "and", + "content":[{ + "op" : "in", + "content": { + "field" : "data_category", + "value":["Simple Nucleotide Variation"] + } + },{ + "op" : "in", + "content": { + "field" : "access", + "value":["open"] + } + }] + } } + for row in query_gdc("files", parameters): + outPath = '{}/{}.maf.gz'.format(outdir, row['file_id']) + if not os.path.exists(outPath): + get_file(row['file_id'], outPath + ".tmp" ) + shutil.move(outPath + ".tmp", outPath) + #print(row) + +def scrapeControlledMaf(outdir): + parameters = { "filters" : { + "op" : "and", + "content":[{ + "op" : "in", + "content": { + "field" : "data_category", + "value":["Simple Nucleotide Variation"] + } + },{ + "op" : "in", + "content": { + "field" : "access", + "value":["controlled"] + } + }] + } } + for row in query_gdc("files", parameters): + outPath = '{}/{}.maf.gz'.format(outdir, row['file_id']) + if not os.path.exists(outPath): + get_file(row['file_id'], outPath + ".tmp" ) + shutil.move(outPath + ".tmp", outPath) + #print(row) + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("-e", "--endpoint", default=URL_BASE) + parser.add_argument("-t", "--token", default=None) + parser.add_argument("method") + parser.add_argument("dest") + + args = parser.parse_args() + + URL_BASE = args.endpoint + if args.token is not None: + with open(args.token, "rt") as handle: + TOKEN = handle.read().strip() + + if args.method == "projects": + scrapeProjects(args.dest) + if args.method == "cases": + scrapeCases(args.dest) + if args.method == "files": + scrapeFiles(args.dest) + if args.method == "compounds": + scrapeCompounds(args.dest) + if args.method == "expression": + scrapeExpression(args.dest) + if args.method == "open-maf": + scrapeOpenMaf(args.dest) + if args.method == "controlled-maf": + 
scrapeControlledMaf(args.dest) From 05a3c0d5ddf0ea988d56828b252a5ea241ff22f7 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 9 Oct 2024 10:50:11 -0700 Subject: [PATCH 19/24] htan cli --- fhirizer/cli.py | 20 ++- fhirizer/htan2fhir.py | 363 +++++++++++++++++++++--------------------- 2 files changed, 191 insertions(+), 192 deletions(-) diff --git a/fhirizer/cli.py b/fhirizer/cli.py index 69e0189..209a42f 100644 --- a/fhirizer/cli.py +++ b/fhirizer/cli.py @@ -1,5 +1,4 @@ -from fhirizer import utils, mapping, entity2fhir -from fhirizer import icgc2fhir +from fhirizer import utils, mapping, entity2fhir, icgc2fhir, htan2fhir import click from pathlib import Path @@ -141,10 +140,10 @@ def convert(name, in_path, out_path, verbose): show_default=True, help='entity name to map - project, case, file of GDC or cellosaurus') @click.option('--out_dir', cls=NotRequiredIf, - not_required_if='icgc', + not_required_if='htan', help='Directory path to save mapped FHIR ndjson files.') @click.option('--entity_path', cls=NotRequiredIf, - not_required_if='icgc', + not_required_if='htan', help='Path to GDC entity with mapped FHIR like keys (converted file via convert). ' 'or Cellosaurus ndjson file of human cell-lines of interest') @click.option('--icgc', help='Name of the ICGC project to FHIRize.') @@ -153,10 +152,13 @@ def convert(name, in_path, out_path, verbose): @click.option('--convert', is_flag=True, help='Boolean indicating to write converted keys to directory') @click.option('--verbose', is_flag=True) def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): - name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc'] - assert name in ['project', 'case', 'file', 'cellosaurus', 'icgc'], f'--name is not in {name_list}.' - assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path." - assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path." + name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc', 'htan'] + assert name in name_list, f'--name is not in {name_list}.' + if name != 'htan': + assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path." + assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path." 
+ else: + assert Path("./projects/HTAN").is_dir() if name in 'project': entity2fhir.project_gdc_to_fhir_ndjson(out_dir=out_dir, projects_path=entity_path, convert=convert, verbose=verbose) @@ -168,6 +170,8 @@ def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): entity2fhir.cellosaurus2fhir(out_dir=out_dir, path=entity_path) if name in 'icgc' and icgc: icgc2fhir.icgc2fhir(project_name=icgc, has_files=has_files) + if name in 'htan': + htan2fhir.htan2fhir(verbose=verbose) if __name__ == '__main__': diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 5fe4846..edce3a9 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -1,5 +1,6 @@ import uuid import json +import warnings import numpy as np import orjson @@ -306,7 +307,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien observation_fields = [] if official_focus in ["Patient", "Condition"]: - mappings = transformer.cases_mappings() + mappings = self.cases_mappings() code = { "coding": [ { @@ -319,11 +320,11 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien } elif official_focus in ["MedicationAdministration"]: - mappings = transformer.cases_mappings() + mappings = self.cases_mappings() code = self.med_admin_code elif official_focus in ["DocumentReference"]: - mappings = transformer.files_mappings() + mappings = self.files_mappings() code = { "coding": [ { @@ -336,7 +337,7 @@ def create_observation(self, _row: pd.Series, patient: Optional[Patient], patien } elif official_focus in ["Specimen"]: - mappings = transformer.biospecimen_mappings() + mappings = self.biospecimen_mappings() code = { "coding": [ { @@ -538,9 +539,9 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat drugname_fhir_ids.update({drug: medication.id}) if substance_definitions: - transformer.write_ndjson(substance_definitions) + self.write_ndjson(substance_definitions) if substances: - transformer.write_ndjson(substances) + self.write_ndjson(substances) cases['Medication_ID'] = cases['Therapeutic Agents'].map(drugname_fhir_ids, na_action='ignore') @@ -555,7 +556,7 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat cases.loc[index, 'Medication_ID'] = drugname_fhir_ids[row['Therapeutic Agents']] if medications: - transformer.write_ndjson(medications) + self.write_ndjson(medications) if 'Medication_ID' in cases.columns: return cases @@ -610,7 +611,7 @@ def create_patient(self, _row: pd.Series) -> Patient: def patient_observation(self, patient: Patient, _row: pd.Series) -> Observation: patient_observation_fields = [] - for field, fhir_map, use, focus in self.get_fields_by_fhir_map(transformer.cases_mappings(), + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), "Observation.component"): if focus == "Patient": patient_observation_fields.append(field) @@ -1001,179 +1002,173 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -# TNP_SARDANA drug name syntax error -atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", - "Vanderbilt"] - -# atlas_name = ["OHSU"] -db_path = '../../bmeg_backup_0516/bmeg-etl_chembl/source/chembl/chembl_34/chembl_34_sqlite/chembl_34.db' - - -for name in atlas_name: - print(f"Transforming {name}") - - transformer = HTANTransformer(subprogram_name=name, 
out_dir=f"./projects/HTAN/{name}/META", verbose=False) - patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", - verbose=False) - specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", - verbose=False) - documentreference_transformer = DocumentReferenceTransformer(subprogram_name=name, - out_dir=f"./projects/HTAN/{name}/META", - verbose=False) - - patient_demographics_df = transformer.patient_demographics - cases = transformer.cases - htan_biospecimens = transformer.biospecimens - files = transformer.files - files_drs_meta = transformer.files_drs_meta - - patients = [] - research_studies = [] - research_subjects = [] - conditions = [] - encounters = [] - observations = [] - med_admins = [] - - if not cases["Therapeutic Agents"].isnull().all() or not cases["Treatment Type"].isnull().all(): - cases = transformer.transform_medication(cases, db_file_path=db_path) - - for index, row in cases.iterrows(): - research_study = patient_transformer.create_researchstudy(_row=row) - - if research_study: - research_studies.append(transformer.program_research_study) - research_studies.append(research_study) - - patient_row = cases.iloc[index][patient_demographics_df.columns] - patient = patient_transformer.create_patient(_row=patient_row) - patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) - if patient_obs: - observations.append(patient_obs) - if patient: - patients.append(patient) - # print(f"HTAN FHIR Patient: {patient.json()}") - # print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") - - research_subject = patient_transformer.create_researchsubject(patient, research_study) - if research_subject: - research_subjects.append(research_subject) - - encounter = patient_transformer.create_encounter(_row=row, patient=patient, condition=None, - procedure=None) - if encounter: - encounters.append(encounter) - condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, - body_structure=None) - - if condition: - conditions.append(condition) - - condition_observation = patient_transformer.create_observation(_row=row, patient=patient, - patient_id=patient.id, - official_focus="Condition", - focus=[Reference(**{ - "reference": f"Condition/{condition.id}"})], - specimen=None, components=None, - category=None, - relax=False) - if condition_observation: - observations.append(condition_observation) - - if not pd.isnull(row["Treatment Type"]): - med_admin = patient_transformer.create_medication_administration(_row=row, - patient_id=patient.id) - if med_admin: - med_admins.append(med_admin) - med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, - official_focus="MedicationAdministration", - focus=[Reference(**{ - "reference": f"MedicationAdministration/{med_admin.id}"})], - patient_id=patient.id, - specimen=None, components=None, - category=None, - relax=False) - if med_admin_observation: - observations.append(med_admin_observation) - - specimens = [] - for specimen_index, specimen_row in htan_biospecimens.iterrows(): - # specimen_row = htan_biospecimens.iloc[specimen_index] - specimen = specimen_transformer.create_specimen(_row=specimen_row) - if specimen: - specimens.append(specimen) - - participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ - "participant_id"] - assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant 
associated with it." - - specimen_participant_id = specimen_transformer.get_patient_id(participant_id=participant_id) - specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, - official_focus="Specimen", - focus=[Reference(**{ - "reference": f"Specimen/{specimen.id}"})], - patient_id=specimen_participant_id, - specimen=specimen, components=None, - category=transformer.lab_category, - relax=False) - if specimen_observation: - observations.append(specimen_observation) - - specimen_ids = [s.id for s in specimens] - patient_ids = [p.id for p in patients] - document_references = [] - for document_reference_index, document_reference_row in files_drs_meta.iterrows(): - docref = documentreference_transformer.create_document_reference(_row=document_reference_row, - specimen_ids=specimen_ids) - if docref: - document_references.append(docref) - - docref_patient_id = None - if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull( - document_reference_row['HTAN Participant ID']): - docref_patient = documentreference_transformer.get_patient_id( - participant_id=document_reference_row['HTAN Participant ID']) - if docref_patient in patient_ids: - docref_patient_id = docref_patient - # else: - # print(f"HTAN {name} is missing patient reference in files") - - document_reference_observation = documentreference_transformer.create_observation( - _row=document_reference_row, patient=None, - official_focus="DocumentReference", - focus=[Reference(**{ - "reference": f"DocumentReference/{docref.id}"})], - patient_id=docref_patient_id, - specimen=None, components=None, - category=transformer.lab_category, - relax=True) - - if document_reference_observation: - observations.append(document_reference_observation) - - if research_subjects: - transformer.write_ndjson(research_subjects) - if research_studies: - transformer.write_ndjson(research_studies) - if patients: - transformer.write_ndjson(patients) - if encounters: - transformer.write_ndjson(encounters) - if conditions: - transformer.write_ndjson(conditions) - if observations: - transformer.write_ndjson(observations) - if specimens: - transformer.write_ndjson(specimens) - if document_references: - transformer.write_ndjson(document_references) - if med_admins: - transformer.write_ndjson(med_admins) - - - # participant ids from specimen identifiers - # print(transformer.decipher_htan_id(htan_biospecimens["HTAN Biospecimen ID"][0])) - # print(transformer.decipher_htan_id(cases["HTAN Participant ID"][0])) - -# make all possible medications in cases -# point to the medication in MedicationAdministartion - Medication -> Substance -> SubstanceDefination 0...* [representaiton.format representation.string] +def htan2fhir(verbose): + warnings.filterwarnings('ignore') + atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", + "Vanderbilt"] + # TNP_SARDANA drug name syntax error + db_path = str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) + + for name in atlas_name: + if verbose: + print(f"Transforming {name}") + + transformer = HTANTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", verbose=verbose) + patient_transformer = PatientTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=verbose) + specimen_transformer = SpecimenTransformer(subprogram_name=name, out_dir=f"./projects/HTAN/{name}/META", + verbose=verbose) + documentreference_transformer = 
DocumentReferenceTransformer(subprogram_name=name, + out_dir=f"./projects/HTAN/{name}/META", + verbose=verbose) + + patient_demographics_df = transformer.patient_demographics + cases = transformer.cases + htan_biospecimens = transformer.biospecimens + files = transformer.files + files_drs_meta = transformer.files_drs_meta + + patients = [] + research_studies = [] + research_subjects = [] + conditions = [] + encounters = [] + observations = [] + med_admins = [] + + if not cases["Therapeutic Agents"].isnull().all() or not cases["Treatment Type"].isnull().all(): + cases = transformer.transform_medication(cases, db_file_path=db_path) + + for index, row in cases.iterrows(): + research_study = patient_transformer.create_researchstudy(_row=row) + + if research_study: + research_studies.append(transformer.program_research_study) + research_studies.append(research_study) + + patient_row = cases.iloc[index][patient_demographics_df.columns] + patient = patient_transformer.create_patient(_row=patient_row) + patient_obs = patient_transformer.patient_observation(patient=patient, _row=row) + if patient_obs: + observations.append(patient_obs) + if patient: + patients.append(patient) + # print(f"HTAN FHIR Patient: {patient.json()}") + # print(f"HTAN FHIR Patient Observation: {patient_obs.json()}") + + research_subject = patient_transformer.create_researchsubject(patient, research_study) + if research_subject: + research_subjects.append(research_subject) + + encounter = patient_transformer.create_encounter(_row=row, patient=patient, condition=None, + procedure=None) + if encounter: + encounters.append(encounter) + condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, + body_structure=None) + + if condition: + conditions.append(condition) + + condition_observation = patient_transformer.create_observation(_row=row, patient=patient, + patient_id=patient.id, + official_focus="Condition", + focus=[Reference(**{ + "reference": f"Condition/{condition.id}"})], + specimen=None, components=None, + category=None, + relax=False) + if condition_observation: + observations.append(condition_observation) + + if not pd.isnull(row["Treatment Type"]): + med_admin = patient_transformer.create_medication_administration(_row=row, + patient_id=patient.id) + if med_admin: + med_admins.append(med_admin) + med_admin_observation = patient_transformer.create_observation(_row=row, patient=None, + official_focus="MedicationAdministration", + focus=[Reference(**{ + "reference": f"MedicationAdministration/{med_admin.id}"})], + patient_id=patient.id, + specimen=None, components=None, + category=None, + relax=False) + if med_admin_observation: + observations.append(med_admin_observation) + + specimens = [] + for specimen_index, specimen_row in htan_biospecimens.iterrows(): + # specimen_row = htan_biospecimens.iloc[specimen_index] + specimen = specimen_transformer.create_specimen(_row=specimen_row) + if specimen: + specimens.append(specimen) + + participant_id = specimen_transformer.decipher_htan_id(specimen_row["HTAN Biospecimen ID"])[ + "participant_id"] + assert participant_id, f"Specimen {specimen_row["HTAN Biospecimen ID"]} does not have a patient participant associated with it." 
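+            # NOTE: the participant id recovered from the biospecimen id is used to
+            # mint the Patient id so the specimen-focused Observation references
+            # the correct Patient.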
+ + specimen_participant_id = specimen_transformer.get_patient_id(participant_id=participant_id) + specimen_observation = specimen_transformer.create_observation(_row=specimen_row, patient=None, + official_focus="Specimen", + focus=[Reference(**{ + "reference": f"Specimen/{specimen.id}"})], + patient_id=specimen_participant_id, + specimen=specimen, components=None, + category=transformer.lab_category, + relax=False) + if specimen_observation: + observations.append(specimen_observation) + + specimen_ids = [s.id for s in specimens] + patient_ids = [p.id for p in patients] + document_references = [] + for document_reference_index, document_reference_row in files_drs_meta.iterrows(): + docref = documentreference_transformer.create_document_reference(_row=document_reference_row, + specimen_ids=specimen_ids) + if docref: + document_references.append(docref) + + docref_patient_id = None + if 'HTAN Participant ID' in document_reference_row.keys() and pd.isnull( + document_reference_row['HTAN Participant ID']): + docref_patient = documentreference_transformer.get_patient_id( + participant_id=document_reference_row['HTAN Participant ID']) + if docref_patient in patient_ids: + docref_patient_id = docref_patient + # else: + # print(f"HTAN {name} is missing patient reference in files") + + document_reference_observation = documentreference_transformer.create_observation( + _row=document_reference_row, patient=None, + official_focus="DocumentReference", + focus=[Reference(**{ + "reference": f"DocumentReference/{docref.id}"})], + patient_id=docref_patient_id, + specimen=None, components=None, + category=transformer.lab_category, + relax=True) + + if document_reference_observation: + observations.append(document_reference_observation) + + if research_subjects: + transformer.write_ndjson(research_subjects) + if research_studies: + transformer.write_ndjson(research_studies) + if patients: + transformer.write_ndjson(patients) + if encounters: + transformer.write_ndjson(encounters) + if conditions: + transformer.write_ndjson(conditions) + if observations: + transformer.write_ndjson(observations) + if specimens: + transformer.write_ndjson(specimens) + if document_references: + transformer.write_ndjson(document_references) + if med_admins: + transformer.write_ndjson(med_admins) + +# for i in $(ls projects/HTAN); do g3t meta validate projects/HTAN/$i/META; done \ No newline at end of file From 57bbf0d7274b9949797111e715d41fe85ca5e3e8 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Wed, 9 Oct 2024 11:11:15 -0700 Subject: [PATCH 20/24] updated readme --- README.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3968812..e5a89e6 100644 --- a/README.md +++ b/README.md @@ -77,8 +77,33 @@ Detailed step-by-step guide on FHIRizing data for a project's study can be found ``` - HTAN +FHIRizing HTAN depends on the: +1. Folder hierarchy with naming conventions as below and existance of raw data pulled from HTAN +``` +fhirizer/ +|-- projects/ +| └── HTAN/ +| └── OHSU/ +| |-- raw/ +| | |-- files/ +| | | |-- table_data.tsv +| | | └── cds_manifest.csv +| | |-- biospecimens/table_data.tsv +| | └── cases/table_data.tsv +| └── META/ +``` +2. 
existance of chembl DB file +``` +fhirizer/ +|-- resources/ + └── chembl_resources/chembl_34.db + +``` + +Example run: + ``` - fhirizer generate --name htan --out_dir ./projects//META --entity_path ./projects// + fhirizer generate --name htan ``` ### Constructing GDC maps cli cmds From 683335deb74dd393a352fb7de6ba3c0d30e3a519 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 10 Oct 2024 06:08:31 -0700 Subject: [PATCH 21/24] pass list to notrequired cli options - check for db file --- fhirizer/cli.py | 19 +++++++++++-------- fhirizer/htan2fhir.py | 1 + 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fhirizer/cli.py b/fhirizer/cli.py index 209a42f..4f0324b 100644 --- a/fhirizer/cli.py +++ b/fhirizer/cli.py @@ -7,21 +7,23 @@ class NotRequiredIf(click.Option): def __init__(self, *args, **kwargs): self.not_required_if = kwargs.pop('not_required_if') assert self.not_required_if, "'not_required_if' parameter required" + if isinstance(self.not_required_if, str): + self.not_required_if = [self.not_required_if] kwargs['help'] = (kwargs.get('help', '') + ' NOTE: This argument is mutually exclusive with %s' % - self.not_required_if + ', '.join(self.not_required_if) ).strip() super(NotRequiredIf, self).__init__(*args, **kwargs) def handle_parse_result(self, ctx, opts, args): we_are_present = self.name in opts - other_present = self.not_required_if in opts + others_present = [opt for opt in self.not_required_if if opt in opts] - if other_present: + if others_present: if we_are_present: raise click.UsageError( "Illegal usage: `%s` is mutually exclusive with `%s`" % ( - self.name, self.not_required_if)) + self.name, ', '.join(others_present))) else: self.prompt = None @@ -29,6 +31,7 @@ def handle_parse_result(self, ctx, opts, args): ctx, opts, args) + @click.group() def cli(): """GDC, Cellosaurus, ICGC to FHIR schema Key and Content Mapping""" @@ -140,12 +143,12 @@ def convert(name, in_path, out_path, verbose): show_default=True, help='entity name to map - project, case, file of GDC or cellosaurus') @click.option('--out_dir', cls=NotRequiredIf, - not_required_if='htan', + not_required_if=['htan', 'icgc'], help='Directory path to save mapped FHIR ndjson files.') @click.option('--entity_path', cls=NotRequiredIf, - not_required_if='htan', - help='Path to GDC entity with mapped FHIR like keys (converted file via convert). ' - 'or Cellosaurus ndjson file of human cell-lines of interest') + not_required_if=['htan', 'icgc'], + help='Path to GDC entity with mapped FHIR like keys (converted file via convert) or Cellosaurus ndjson ' + 'file of human cell-lines of interest.') @click.option('--icgc', help='Name of the ICGC project to FHIRize.') @click.option('--has_files', is_flag=True, help='Boolean indicating file metatda via new argo site is available @ ' 'ICGC/{project}/data directory to FHIRize.') diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index edce3a9..04c4b75 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -1008,6 +1008,7 @@ def htan2fhir(verbose): "Vanderbilt"] # TNP_SARDANA drug name syntax error db_path = str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) + assert Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db').is_file(), f"chEMBL db file chembl_34.db does not exist." 
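+    # NOTE: the bundled ChEMBL sqlite db is required up front; transform_medication()
+    # queries it for InChI/SMILES representations whenever 'Therapeutic Agents' are present.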
for name in atlas_name: if verbose: From f1bebefb97a91eb5a34fac2a64649fc536a8077c Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 10 Oct 2024 08:52:12 -0700 Subject: [PATCH 22/24] htan condition stage --- fhirizer/htan2fhir.py | 142 +++++++++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 31 deletions(-) diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py index 04c4b75..e2e63a5 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -40,7 +40,8 @@ from fhir.resources.medicationadministration import MedicationAdministration from fhir.resources.medication import Medication, MedicationIngredient from fhir.resources.substance import Substance, SubstanceIngredient -from fhir.resources.substancedefinition import SubstanceDefinition,SubstanceDefinitionStructure, SubstanceDefinitionStructureRepresentation, SubstanceDefinitionName +from fhir.resources.substancedefinition import SubstanceDefinition, SubstanceDefinitionStructure, \ + SubstanceDefinitionStructureRepresentation, SubstanceDefinitionName # File data on synapse after authentication @@ -80,14 +81,14 @@ def __init__(self, subprogram_name: str, out_dir: str, verbose: bool): } ] self.med_admin_code = { - "coding": [ - { + "coding": [ + { "system": "http://loinc.org", "code": "80565-5", "display": "Medication administration record" - } - ], - "text": "Medication administration record" + } + ], + "text": "Medication administration record" } parent_researchstudy_identifier = Identifier(**{"system": self.SYSTEM_HTAN, "use": "official", "value": "HTAN"}) parent_researchstudy_id = self.mint_id(identifier=parent_researchstudy_identifier, @@ -410,6 +411,7 @@ def get_patient_id(self, participant_id) -> str: patient_id = self.mint_id(identifier=patient_identifier, resource_type="Patient", project_id=self.project_id, namespace=self.NAMESPACE_HTAN) return patient_id + @staticmethod def create_substance_definition_representations(df: pd.DataFrame) -> list: representations = [] @@ -431,7 +433,8 @@ def create_substance_definition_representations(df: pd.DataFrame) -> list: def create_substance_definition(self, compound_name: str, representations: list) -> SubstanceDefinition: sub_def_identifier = Identifier(**{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) - sub_def_id = self.mint_id(identifier=sub_def_identifier, resource_type="SubstanceDefinition", project_id=self.project_id, + sub_def_id = self.mint_id(identifier=sub_def_identifier, resource_type="SubstanceDefinition", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) return SubstanceDefinition(**{"id": sub_def_id, @@ -440,18 +443,20 @@ def create_substance_definition(self, compound_name: str, representations: list) "name": [SubstanceDefinitionName(**{"name": compound_name})] }) - def create_substance(self, compound_name:str, substance_definition: SubstanceDefinition) -> Substance: + def create_substance(self, compound_name: str, substance_definition: SubstanceDefinition) -> Substance: code = None if substance_definition: code = CodeableReference( - **{"concept": CodeableConcept(**{"coding": [{"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), "display": compound_name}]}), + **{"concept": CodeableConcept(**{"coding": [ + {"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), + "display": compound_name}]}), "reference": Reference(**{"reference": f"SubstanceDefinition/{substance_definition.id}"})}) sub_identifier = Identifier( **{"system": self.SYSTEM_chEMBL, "value": 
compound_name, "use": "official"}) sub_id = self.mint_id(identifier=sub_identifier, resource_type="Substance", - project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) return Substance(**{"id": sub_id, "identifier": [sub_identifier], @@ -461,7 +466,8 @@ def create_substance(self, compound_name:str, substance_definition: SubstanceDef "display": "Drug or Medicament"}]})], "code": code}) - def create_medication(self, compound_name: Optional[str], treatment_type: Optional[str], _substance: Optional[Substance]) -> Medication: + def create_medication(self, compound_name: Optional[str], treatment_type: Optional[str], + _substance: Optional[Substance]) -> Medication: code = None med_identifier = None if compound_name: @@ -480,12 +486,13 @@ def create_medication(self, compound_name: Optional[str], treatment_type: Option **{"system": self.SYSTEM_HTAN, "value": treatment_type, "use": "official"}) med_id = self.mint_id(identifier=med_identifier, resource_type="Medication", - project_id=self.project_id, - namespace=self.NAMESPACE_HTAN) + project_id=self.project_id, + namespace=self.NAMESPACE_HTAN) ingredients = [] if _substance: - ingredients.append(MedicationIngredient(**{"item": CodeableReference(**{"reference": Reference(**{"reference": f"Substance/{_substance.id}"})})})) + ingredients.append(MedicationIngredient(**{ + "item": CodeableReference(**{"reference": Reference(**{"reference": f"Substance/{_substance.id}"})})})) return Medication(**{"id": med_id, "identifier": [med_identifier], @@ -519,7 +526,7 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat if drug_info["has_info"].any(): drug_representations = self.create_substance_definition_representations(drug_info) substance_definition = self.create_substance_definition(compound_name=drug, - representations=drug_representations) + representations=drug_representations) if substance_definition: substance_definitions.append(substance_definition) @@ -528,7 +535,8 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat if substance: substances.append(substance) - medication = self.create_medication(compound_name=drug, _substance=substance, treatment_type=None) + medication = self.create_medication(compound_name=drug, _substance=substance, + treatment_type=None) if medication: medications.append(medication) drugname_fhir_ids.update({drug: medication.id}) @@ -547,7 +555,8 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat for index, row in cases.iterrows(): if pd.isnull(row["Therapeutic Agents"]) and not pd.isnull(row["Treatment Type"]): - medication_agent = self.create_medication(compound_name=None, _substance=None, treatment_type=row["Treatment Type"]) + medication_agent = self.create_medication(compound_name=None, _substance=None, + treatment_type=row["Treatment Type"]) if medication_agent: medications.append(medication_agent) cases.loc[index, 'Medication_ID'] = medication_agent.id @@ -731,7 +740,7 @@ def create_body_structure(self, _row, patient: Patient) -> BodyStructure: }) def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encounter, - body_structure: Optional[BodyStructure]) -> Optional[Condition]: + body_structure: Optional[BodyStructure], stage_observation: Optional[Observation]) -> Optional[Condition]: primary_diagnosis = _row.get("Primary Diagnosis") if pd.isnull(primary_diagnosis): return None @@ -776,6 +785,10 @@ def create_condition(self, _row: pd.Series, 
patient: Patient, encounter: Encount patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, "system": self.SYSTEM_HTAN, "display": patient_body_site}]})] + condition_stage = [] + stages = self.create_stage(_row=_row, stage_observation=None) + if stages: + condition_stage = stages return Condition(**{"id": condition_id, "identifier": [condition_identifier], @@ -791,7 +804,7 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "bodySite": patient_body_site_cc, # "bodyStructure": patient_body_structure_ref, "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}), - "stage": [], + "stage": condition_stage, }) def create_medication_administration(self, _row: pd.Series, patient_id: str) -> MedicationAdministration: @@ -838,7 +851,7 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> "status": status, "occurenceDateTime": "2024-10-8T10:30:00.724446-05:00", "category": [CodeableConcept(**{"coding": [{"code": _row["Treatment Type"], - "system": "/".join([self.SYSTEM_HTAN, "Treatment_Type"]) , + "system": "/".join([self.SYSTEM_HTAN, "Treatment_Type"]), "display": _row["Treatment Type"]}]})], "medication": CodeableReference(**{"concept": medication_code, "reference": Reference( **{"reference": f"Medication/{_row['Medication_ID']}"})}), @@ -846,6 +859,66 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> return MedicationAdministration(**data) + def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation]) -> list: + + assessment = [] + if stage_observation: + assessment.append(Reference(**{"reference": f"Observation/{stage_observation.id}"})) + + # find fields w Condition.stage.summary mappings + cancer_pathological_staging = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'gdc_resources' / 'content_annotations' / 'diagnosis' / 'cancer_pathological_staging.json'))) + + stage_fields = [] + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), + "Condition.stage.summary"): + if "Tumor Grade" in field or "AJCC Pathologic" in field: + stage_fields.append(field) + + if stage_fields: + _stage_df = _row[stage_fields] + + stages = [] + for stage_field in stage_fields: + if not pd.isnull(_row[stage_field]): + + types = [] + summaries = [] + for stage_info in cancer_pathological_staging: + if _row[stage_field] == stage_info["value"]: + type_system = {"code": stage_info["stage_type_sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["stage_type_sctid_display"]} + + summary_htan_system = {"code": _row[stage_field], + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "display": _row[stage_field]} + + summary_snomed_system = {"code": stage_info["sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["sctid_display"]} + + types.append(type_system) + summaries.append(summary_htan_system) + summaries.append(summary_snomed_system) + if not types: + types.append({"code": "_".join(stage_field.lower().split(" ")), + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "display": "_".join(stage_field.lower().split(" "))}) + + summaries.append({"code": _row[stage_field], + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "display": _row[stage_field]}) + + condition_stage = ConditionStage( + **{"summary": CodeableConcept(**{"coding": summaries}), + "assessment": assessment, + 
"type": CodeableConcept(**{"coding": types})}) + if condition_stage: + stages.append(condition_stage) + + return stages + class SpecimenTransformer(HTANTransformer): def __init__(self, *args: Any, **kwargs: Any): @@ -1002,15 +1075,20 @@ def create_document_reference(self, _row: pd.Series, specimen_ids: list) -> Docu # 2 Projects that don't have files download or cds manifest SRRS and TNP_TMA (Oct/2024) # 12/14 total Atlas -def htan2fhir(verbose): +def htan2fhir(verbose, entity_atlas_name): warnings.filterwarnings('ignore') - atlas_name = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", - "Vanderbilt"] + + atlas_names = ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford", + "Vanderbilt", "TNP_SARDANA"] + assert entity_atlas_name not in atlas_names, f"Please provide a valid HTAN Atlas name in: {atlas_names}" + # TNP_SARDANA drug name syntax error - db_path = str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) - assert Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db').is_file(), f"chEMBL db file chembl_34.db does not exist." + db_path = str( + Path(importlib.resources.files('fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db')) + assert Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'chembl_resources' / 'chembl_34.db').is_file(), f"chEMBL db file chembl_34.db does not exist." - for name in atlas_name: + for name in entity_atlas_name: if verbose: print(f"Transforming {name}") @@ -1066,7 +1144,7 @@ def htan2fhir(verbose): if encounter: encounters.append(encounter) condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, - body_structure=None) + body_structure=None, stage_observation=None) if condition: conditions.append(condition) @@ -1076,7 +1154,8 @@ def htan2fhir(verbose): official_focus="Condition", focus=[Reference(**{ "reference": f"Condition/{condition.id}"})], - specimen=None, components=None, + specimen=None, + components=None, category=None, relax=False) if condition_observation: @@ -1092,7 +1171,8 @@ def htan2fhir(verbose): focus=[Reference(**{ "reference": f"MedicationAdministration/{med_admin.id}"})], patient_id=patient.id, - specimen=None, components=None, + specimen=None, + components=None, category=None, relax=False) if med_admin_observation: @@ -1172,4 +1252,4 @@ def htan2fhir(verbose): if med_admins: transformer.write_ndjson(med_admins) -# for i in $(ls projects/HTAN); do g3t meta validate projects/HTAN/$i/META; done \ No newline at end of file +# for i in $(ls projects/HTAN); do g3t meta validate projects/HTAN/$i/META; done From cc2ebf941e3102b8ec8e8628f8fa922faf2434a0 Mon Sep 17 00:00:00 2001 From: teslajoy Date: Thu, 10 Oct 2024 13:08:02 -0700 Subject: [PATCH 23/24] stage observations + check medication code --- README.md | 6 ++ fhirizer/htan2fhir.py | 173 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 160 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index e5a89e6..bfbba2f 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,12 @@ Example run: ``` fhirizer generate --name htan ``` + +G3T validate FHIRized ndjson files: +```commandline +for i in $(ls projects/HTAN); do echo $i && g3t meta validate projects/HTAN/$i/META; done +``` + ### Constructing GDC maps cli cmds initialize initial structure of project, case, or file to add Maps diff --git a/fhirizer/htan2fhir.py b/fhirizer/htan2fhir.py 
index e2e63a5..8dd1167 100644 --- a/fhirizer/htan2fhir.py +++ b/fhirizer/htan2fhir.py @@ -470,7 +470,10 @@ def create_medication(self, compound_name: Optional[str], treatment_type: Option _substance: Optional[Substance]) -> Medication: code = None med_identifier = None + if compound_name: + if ":" in compound_name: + compound_name.replace(":", "_") code = CodeableConcept(**{"coding": [ {"code": compound_name, "system": "/".join([self.SYSTEM_chEMBL, "compound_name"]), "display": compound_name}]}) @@ -478,6 +481,9 @@ def create_medication(self, compound_name: Optional[str], treatment_type: Option med_identifier = Identifier( **{"system": self.SYSTEM_chEMBL, "value": compound_name, "use": "official"}) else: + if ":" in treatment_type: + treatment_type.replace(":", "_") + code = CodeableConcept(**{"coding": [ {"code": treatment_type, "system": "/".join([self.SYSTEM_HTAN, "treatment_type"]), "display": treatment_type}]}) @@ -516,6 +522,11 @@ def transform_medication(self, cases: pd.DataFrame, db_file_path: str) -> pd.Dat cases["Therapeutic Agents"] = cases["Therapeutic Agents"].str.upper() drug_names = list(cases["Therapeutic Agents"][~cases["Therapeutic Agents"].isna()].unique()) # drug_names = [d.upper() for d in drug_names] + + for drug_name in drug_names: + if ":" in drug_name: + drug_name.replace(":", "_") + dat = self.get_chembl_compound_info(db_file_path=db_file_path, drug_names=drug_names, limit=1000) drug_df = pd.DataFrame(dat) drug_df.columns = ["CHEMBL_ID", "STANDARD_INCHI", "CANONICAL_SMILES", "COMPOUND_NAME"] @@ -740,10 +751,10 @@ def create_body_structure(self, _row, patient: Patient) -> BodyStructure: }) def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encounter, - body_structure: Optional[BodyStructure], stage_observation: Optional[Observation]) -> Optional[Condition]: + body_structure: Optional[BodyStructure], stage_observation: Optional[Observation]) -> dict: primary_diagnosis = _row.get("Primary Diagnosis") if pd.isnull(primary_diagnosis): - return None + return {} # identifier string = project / patient / primary diagnosis condition_identifier = Identifier(**{"system": self.SYSTEM_HTAN, @@ -785,12 +796,8 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount patient_body_site_cc = [CodeableConcept(**{"coding": [{"code": patient_body_site, "system": self.SYSTEM_HTAN, "display": patient_body_site}]})] - condition_stage = [] - stages = self.create_stage(_row=_row, stage_observation=None) - if stages: - condition_stage = stages - return Condition(**{"id": condition_id, + condition = Condition(**{"id": condition_id, "identifier": [condition_identifier], "code": CodeableConcept(**{"coding": [{"code": primary_diagnosis, "system": self.SYSTEM_HTAN, @@ -803,10 +810,20 @@ def create_condition(self, _row: pd.Series, patient: Patient, encounter: Encount "recordedDate": recorded_date, "bodySite": patient_body_site_cc, # "bodyStructure": patient_body_structure_ref, - "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}), - "stage": condition_stage, + "encounter": Reference(**{"reference": f"Encounter/{encounter.id}"}) }) + stage_observations_dict = self.create_stage_observation(_row=_row, condition=condition, patient=patient) + + condition_stage = [] + stages = self.create_stage(_row=_row, stage_observations_dict=stage_observations_dict) + if stages: + condition_stage = stages + + condition.stage = condition_stage + + return {"condition": condition, "stage_observations_dict": stage_observations_dict} + def 
create_medication_administration(self, _row: pd.Series, patient_id: str) -> MedicationAdministration: # if Treatment Type exists - make MedicationAdministration # if Days to Treatment End, then status -> completed, else status unknown @@ -859,11 +876,8 @@ def create_medication_administration(self, _row: pd.Series, patient_id: str) -> return MedicationAdministration(**data) - def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation]) -> list: - + def create_stage(self, _row: pd.Series, stage_observations_dict: dict) -> list: assessment = [] - if stage_observation: - assessment.append(Reference(**{"reference": f"Observation/{stage_observation.id}"})) # find fields w Condition.stage.summary mappings cancer_pathological_staging = utils._read_json(str(Path(importlib.resources.files( @@ -873,6 +887,7 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), "Condition.stage.summary"): if "Tumor Grade" in field or "AJCC Pathologic" in field: + # TODO: check for 8th/other edition stage_fields.append(field) if stage_fields: @@ -880,6 +895,10 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] stages = [] for stage_field in stage_fields: + stage_name = "_".join(stage_field.lower().split(" ")) + stage_observation = stage_observations_dict.get(stage_name) + if stage_observation: + assessment = [Reference(**{"reference": f"Observation/{stage_observation.id}"})] if not pd.isnull(_row[stage_field]): types = [] @@ -891,7 +910,8 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] "display": stage_info["stage_type_sctid_display"]} summary_htan_system = {"code": _row[stage_field], - "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "system": "/".join( + [self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), "display": _row[stage_field]} summary_snomed_system = {"code": stage_info["sctid"], @@ -903,7 +923,7 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] summaries.append(summary_snomed_system) if not types: types.append({"code": "_".join(stage_field.lower().split(" ")), - "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), + "system": "/".join([self.SYSTEM_HTAN, "_".join(stage_field.lower().split(" "))]), "display": "_".join(stage_field.lower().split(" "))}) summaries.append({"code": _row[stage_field], @@ -919,6 +939,116 @@ def create_stage(self, _row: pd.Series, stage_observation: Optional[Observation] return stages + def create_stage_observation(self, _row: pd.Series, condition: Condition, patient: Patient) -> dict: + observation_dict = {} + + # find fields w Condition.stage.summary mappings + cancer_pathological_staging = utils._read_json(str(Path(importlib.resources.files( + 'fhirizer').parent / 'resources' / 'gdc_resources' / 'content_annotations' / 'diagnosis' / 'cancer_pathological_staging.json'))) + + ajcc_pathologic_stage_fields = [] + grade_stage_fields = [] + for field, fhir_map, use, focus in self.get_fields_by_fhir_map(self.cases_mappings(), + "Condition.stage.summary"): + if "AJCC Pathologic" in field: + # TODO: check for 8th/other edition + ajcc_pathologic_stage_fields.append(field) + elif "Tumor Grade" in field: + grade_stage_fields.append(field) + + _ajcc_pathologic_stage = None + if pd.notna(ajcc_pathologic_stage_fields).all(): + _ajcc_pathologic_stage = _row[ajcc_pathologic_stage_fields] + 
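+        # NOTE: observations for the other 'AJCC Pathologic *' columns are built first
+        # and collected as 'member' references; the overall 'AJCC Pathologic Stage'
+        # observation created below links to them via hasMember.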
+ member = [] + if pd.notna(ajcc_pathologic_stage_fields).all(): + # print(_ajcc_pathologic_stage, type(_ajcc_pathologic_stage)) + for col_name, value in _ajcc_pathologic_stage.items(): + # "these are children stages and are members" + if value and col_name != "AJCC Pathologic Stage": + stage = "_".join(col_name.lower().split(" ")) + identifier_value = "-".join([patient.identifier[0].value, condition.id, stage]) + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": identifier_value}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + code = None + value_code = None + for stage_info in cancer_pathological_staging: + if value == stage_info["value"]: + code = CodeableConcept(**{"coding": [{"code": stage_info["stage_type_sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["stage_type_sctid_display"]}]}) + + value_code = CodeableConcept(**{"coding": [{"code": stage_info["sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["sctid_display"]}]}) + if not code: + code = CodeableConcept(**{"coding": [{"code": stage, + "system": "/".join([self.SYSTEM_HTAN, stage]), + "display": stage}]}) + + value_code = CodeableConcept(**{"coding": [{"code": value, + "system": "/".join( + [self.SYSTEM_HTAN, stage]), + "display": value}]}) + + _stage_observation = Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "code": code, + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "focus": [ + Reference(**{"reference": f"Condition/{condition.id}"})], + "valueCodeableConcept": value_code}) + observation_dict.update({stage: _stage_observation}) + member.append(Reference(**{"reference": f"Observation/{_stage_observation.id}"})) + # print(member) + + if not pd.isnull(_row["AJCC Pathologic Stage"]): + stage = "_".join("AJCC Pathologic Stage".lower().split(" ")) + identifier_value = "-".join([patient.identifier[0].value, condition.id, stage]) + observation_identifier = Identifier(**{"system": self.SYSTEM_HTAN, + "use": "official", + "value": identifier_value}) + observation_id = self.mint_id(identifier=observation_identifier, resource_type="Observation", + project_id=self.project_id, namespace=self.NAMESPACE_HTAN) + + code = None + value_code = None + for stage_info in cancer_pathological_staging: + if _row["AJCC Pathologic Stage"] == stage_info["value"]: + code = CodeableConcept(**{"coding": [{"code": stage_info["stage_type_sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["stage_type_sctid_display"]}]}) + + value_code = CodeableConcept(**{"coding": [{"code": stage_info["sctid"], + "system": self.SYSTEM_SNOME, + "display": stage_info["sctid_display"]}]}) + if not code: + code = CodeableConcept(**{"coding": [{"code": stage, + "system": "/".join([self.SYSTEM_HTAN, stage]), + "display": stage}]}) + + value_code = CodeableConcept(**{"coding": [{"code": _row["AJCC Pathologic Stage"], + "system": "/".join( + [self.SYSTEM_HTAN, stage]), + "display": _row["AJCC Pathologic Stage"]}]}) + + _stage_observation = Observation(**{"id": observation_id, + "identifier": [observation_identifier], + "status": "final", + "code": code, + "subject": Reference(**{"reference": f"Patient/{patient.id}"}), + "focus": [Reference(**{"reference": f"Condition/{condition.id}"})], + "valueCodeableConcept": value_code, + "hasMember": member}) + observation_dict.update({stage: _stage_observation}) + + 
return observation_dict + class SpecimenTransformer(HTANTransformer): def __init__(self, *args: Any, **kwargs: Any): @@ -1143,17 +1273,22 @@ def htan2fhir(verbose, entity_atlas_name): procedure=None) if encounter: encounters.append(encounter) - condition = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, + condition_dict = patient_transformer.create_condition(_row=row, patient=patient, encounter=encounter, body_structure=None, stage_observation=None) - if condition: - conditions.append(condition) + if condition_dict and condition_dict["condition"]: + conditions.append(condition_dict["condition"]) + + if condition_dict["stage_observations_dict"]: + for key, obs_item in condition_dict["stage_observations_dict"].items(): + if obs_item: + observations.append(obs_item) condition_observation = patient_transformer.create_observation(_row=row, patient=patient, patient_id=patient.id, official_focus="Condition", focus=[Reference(**{ - "reference": f"Condition/{condition.id}"})], + "reference": f"Condition/{condition_dict["condition"].id}"})], specimen=None, components=None, category=None, From ba2650c1f0e9690f07f139cb544668ac769da82a Mon Sep 17 00:00:00 2001 From: teslajoy Date: Fri, 11 Oct 2024 05:47:51 -0700 Subject: [PATCH 24/24] enable one-many atalas names to be passed --- README.md | 5 +++++ fhirizer/cli.py | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bfbba2f..bf26321 100644 --- a/README.md +++ b/README.md @@ -102,9 +102,14 @@ fhirizer/ Example run: +for all available atlases under ./projects/HTAN/ ``` fhirizer generate --name htan ``` +or for one or more: +```commandline +fhirizer generate --name htan --atlas "OHSU,DFCI,WUSTL,BU,CHOP" +``` G3T validate FHIRized ndjson files: ```commandline diff --git a/fhirizer/cli.py b/fhirizer/cli.py index 4f0324b..bdaab62 100644 --- a/fhirizer/cli.py +++ b/fhirizer/cli.py @@ -149,12 +149,16 @@ def convert(name, in_path, out_path, verbose): not_required_if=['htan', 'icgc'], help='Path to GDC entity with mapped FHIR like keys (converted file via convert) or Cellosaurus ndjson ' 'file of human cell-lines of interest.') +@click.option('--atlas', required=False, + default=['OHSU'], + show_default=True, + help='List of atlas project(s) name to FHIRize. ex. ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford"]') @click.option('--icgc', help='Name of the ICGC project to FHIRize.') @click.option('--has_files', is_flag=True, help='Boolean indicating file metatda via new argo site is available @ ' 'ICGC/{project}/data directory to FHIRize.') @click.option('--convert', is_flag=True, help='Boolean indicating to write converted keys to directory') @click.option('--verbose', is_flag=True) -def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): +def generate(name, out_dir, entity_path, icgc, has_files, atlas, convert, verbose): name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc', 'htan'] assert name in name_list, f'--name is not in {name_list}.' 
if name != 'htan': @@ -174,7 +178,14 @@ def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose): if name in 'icgc' and icgc: icgc2fhir.icgc2fhir(project_name=icgc, has_files=has_files) if name in 'htan': - htan2fhir.htan2fhir(verbose=verbose) + if isinstance(atlas, str): + if "," in atlas: + atlas = atlas.split(",") + atlas = [a.strip() for a in atlas] + else: + atlas = [atlas] + + htan2fhir.htan2fhir(entity_atlas_name=atlas, verbose=verbose) if __name__ == '__main__':