Merge pull request #32 from bmeg/development

gdc, cellosaurus, and icgc mintids - harmonization
bmeg · Jul 10, 2024 · a1e9229 · a1e9229
2 parents d1e4b11 + 0cc29aa
commit a1e9229
Show file tree

Hide file tree

Showing 27 changed files with 1,446 additions and 14,103 deletions.
diff --git a/README.md b/README.md
@@ -7,8 +7,8 @@
 ![mapping](./imgs/fhir_flame.png)
 
 
-### Project overview: 
-Mapping GDC (Genomic Data Commons) schema or Cellosaurus cell-lines to FHIR (Fast Healthcare Interoperability Resources).
+### Project overview:
+Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, and International Cancer Genome Consortium (ICGC) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format.
 
 - #### GDC study simplified FHIR graph 
 ![mapping](./imgs/gdc_tcga_study_example_fhir_graph.png)

diff --git a/fhirizer/entity2fhir.py b/fhirizer/entity2fhir.py
diff --git a/fhirizer/icgc2fhir.py b/fhirizer/icgc2fhir.py
diff --git a/fhirizer/labels/case.py b/fhirizer/labels/case.py
@@ -633,6 +633,15 @@
         )
     ),
 
+    Map(
+        source=Source(
+            name='samples.updated_datetime'
+        ),
+        destination=Destination(
+            name='Observation.sample.updated_datetime'
+        )
+    ),
+
     Map(
         source=Source(
             name='samples.is_ffpe'
@@ -678,6 +687,15 @@
         )
     ),
 
+    Map(
+        source=Source(
+            name='samples.portions.updated_datetime'
+        ),
+        destination=Destination(
+            name='Observation.portion.updated_datetime'
+        )
+    ),
+
     Map(
         source=Source(
             name='samples.days_to_collection'
@@ -732,6 +750,15 @@
         )
     ),
 
+    Map(
+        source=Source(
+            name='samples.portions.analytes.updated_datetime'
+        ),
+        destination=Destination(
+            name='Observation.analyte.updated_datetime'
+        )
+    ),
+
     Map(
         source=Source(
             name='samples.portions.analytes.normal_tumor_genotype_snp_match'
@@ -957,6 +984,15 @@
         )
     ),
 
+    Map(
+        source=Source(
+            name='samples.portions.analytes.aliquots.updated_datetime'
+        ),
+        destination=Destination(
+            name='Observation.aliquot.updated_datetime'
+        )
+    ),
+
     Map(
         source=Source(
             name='samples.portions.analytes.aliquots.aliquot_id'
@@ -1065,6 +1101,15 @@
         )
     ),
 
+    Map(
+        source=Source(
+            name='diagnoses.tumor_grade'
+        ),
+        destination=Destination(
+            name='Observation.code.nci_tumor_grade'
+        )
+    ),
+
     Map(
         source=Source(
             name='diagnoses.treatments.treatment_id'
@@ -1164,6 +1209,42 @@
         )
     ),
 
+    Map(
+        source=Source(
+            name='diagnoses.days_to_death'
+        ),
+        destination=Destination(
+            name='Observation.survey.days_to_death'
+        )
+    ),
+
+    Map(
+        source=Source(
+            name='diagnoses.days_to_last_follow_up'
+        ),
+        destination=Destination(
+            name='Observation.survey.days_to_last_follow_up'
+        )
+    ),
+
+    Map(
+        source=Source(
+            name='diagnoses.updated_datetime'
+        ),
+        destination=Destination(
+            name='Observation.survey.updated_datetime'
+        )
+    ),
+
+    Map(
+        source=Source(
+            name='submitter_diagnosis_ids'
+        ),
+        destination=Destination(
+            name='Condition.identifier'
+        )
+    ),
+
     # project Maps -----------------------------------------------------
     Map(
         source=Source(

diff --git a/fhirizer/utils.py b/fhirizer/utils.py
@@ -4,13 +4,16 @@
 import json
 import glob
 import gzip
+import uuid
 import pprint
 import requests
 from bs4 import BeautifulSoup
 from fhirizer.schema import Schema
 from importlib.resources import files
 import importlib
 from pathlib import Path
+from fhir.resources.identifier import Identifier
+from uuid import uuid5, UUID
 
 DATA_DICT_PATH = "".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'data_dictionary')), "/"])
 FIELDS_PATH = "".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'fields')), "/"])
@@ -1069,7 +1072,7 @@ def ncit2mondo(path):
         return data
 
 
-def get_component(key, value=None, component_type=None):
+def get_component(key, value=None, component_type=None, system="https://cadsr.cancer.gov/sample_laboratory_observation"):
     if component_type == 'string':
         value = {"valueString": value}
     elif component_type == 'int':
@@ -1078,14 +1081,16 @@ def get_component(key, value=None, component_type=None):
         value = {"valueQuantity": {"value": value}}
     elif component_type == 'bool':
         value = {"valueBoolean": value}
+    elif component_type == 'dateTime':
+        value = {"valueDateTime": value}
     else:
         pass
 
     component = {
         "code": {
             "coding": [
                 {
-                    "system": "https://cadsr.cancer.gov/sample_laboratory_observation",
+                    "system": system,
                     "code": key,
                     "display": key
                 }
@@ -1098,10 +1103,27 @@ def get_component(key, value=None, component_type=None):
 
     return component
 
+
 def fhir_ndjson(entity, out_path):
     if isinstance(entity, list):
         with open(out_path, 'w', encoding='utf8') as file:
             file.write('\n'.join(map(lambda e: json.dumps(e, ensure_ascii=False), entity)))
     else:
         with open(out_path, 'w', encoding='utf8') as file:
-            file.write(json.dumps(entity, ensure_ascii=False))
+            file.write(json.dumps(entity, ensure_ascii=False))
+
+
+def mint_id(identifier, resource_type, project_id, namespace) -> str:
+    """Mint a deterministic UUID string for a resource identifier.
+
+    Follows the ID-minting convention used by g3t_etl:
+    https://github.com/ACED-IDP/g3t_etl/blob/d095895b0cf594c2fd32b400e6f7b4f9384853e2/g3t_etl/__init__.py#L61
+
+    Args:
+        identifier: Either a ready-made identifier string, or a tuple whose
+            first two items are combined as ``"<item0>|<item1>"``.
+            NOTE(review): the two items look like a FHIR (system, value)
+            pair -- confirm against callers.
+        resource_type: Prefix used for tuple identifiers; must be truthy
+            when ``identifier`` is a tuple, unused otherwise.
+        project_id: Passed through unchanged to ``_mint_id``.
+        namespace: Passed through unchanged to ``_mint_id``.
+
+    Returns:
+        Whatever ``_mint_id`` returns for the resulting identifier string.
+
+    Raises:
+        ValueError: If ``identifier`` is a tuple and ``resource_type`` is
+            empty or None.
+    """
+    if isinstance(identifier, tuple):
+        # Raise explicitly instead of using ``assert``: assertions are
+        # stripped under ``python -O``, which would let a missing
+        # resource_type silently mint an ID from a "None/..." string.
+        if not resource_type:
+            raise ValueError("resource_type is required for Identifier")
+        identifier = f"{resource_type}/{identifier[0]}|{identifier[1]}"
+    return _mint_id(identifier, project_id, namespace)
+
+
+def _mint_id(identifier_string: str, project_id: str, namespace: UUID) -> str:
+    """Return the UUIDv5 string for ``identifier_string`` scoped to ``project_id``.
+
+    The hashed name is ``"<project_id>/<identifier_string>"``, generated
+    under ``namespace``.
+    """
+    scoped_name = f"{project_id}/{identifier_string}"
+    minted = uuid5(namespace, scoped_name)
+    return str(minted)
+