Skip to content

Commit

Permalink
Merge pull request #32 from bmeg/development
Browse files Browse the repository at this point in the history
gdc, cellosaurus, and icgc mintids - harmonization
  • Loading branch information
teslajoy authored Jul 10, 2024
2 parents d1e4b11 + 0cc29aa commit a1e9229
Show file tree
Hide file tree
Showing 27 changed files with 1,446 additions and 14,103 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
![mapping](./imgs/fhir_flame.png)


### Project overview:
Mapping GDC (Genomic Data Commons) schema or Cellosaurus cell-lines to FHIR (Fast Healthcare Interoperability Resources).
### Project overview:
Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, and International Cancer Genome Consortium (ICGC) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format.

- #### GDC study simplified FHIR graph
![mapping](./imgs/gdc_tcga_study_example_fhir_graph.png)
Expand Down
902 changes: 627 additions & 275 deletions fhirizer/entity2fhir.py

Large diffs are not rendered by default.

351 changes: 275 additions & 76 deletions fhirizer/icgc2fhir.py

Large diffs are not rendered by default.

81 changes: 81 additions & 0 deletions fhirizer/labels/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,15 @@
)
),

Map(
source=Source(
name='samples.updated_datetime'
),
destination=Destination(
name='Observation.sample.updated_datetime'
)
),

Map(
source=Source(
name='samples.is_ffpe'
Expand Down Expand Up @@ -678,6 +687,15 @@
)
),

Map(
source=Source(
name='samples.portions.updated_datetime'
),
destination=Destination(
name='Observation.portion.updated_datetime'
)
),

Map(
source=Source(
name='samples.days_to_collection'
Expand Down Expand Up @@ -732,6 +750,15 @@
)
),

Map(
source=Source(
name='samples.portions.analytes.updated_datetime'
),
destination=Destination(
name='Observation.analyte.updated_datetime'
)
),

Map(
source=Source(
name='samples.portions.analytes.normal_tumor_genotype_snp_match'
Expand Down Expand Up @@ -957,6 +984,15 @@
)
),

Map(
source=Source(
name='samples.portions.analytes.aliquots.updated_datetime'
),
destination=Destination(
name='Observation.aliquot.updated_datetime'
)
),

Map(
source=Source(
name='samples.portions.analytes.aliquots.aliquot_id'
Expand Down Expand Up @@ -1065,6 +1101,15 @@
)
),

Map(
source=Source(
name='diagnoses.tumor_grade'
),
destination=Destination(
name='Observation.code.nci_tumor_grade'
)
),

Map(
source=Source(
name='diagnoses.treatments.treatment_id'
Expand Down Expand Up @@ -1164,6 +1209,42 @@
)
),

Map(
source=Source(
name='diagnoses.days_to_death'
),
destination=Destination(
name='Observation.survey.days_to_death'
)
),

Map(
source=Source(
name='diagnoses.days_to_last_follow_up'
),
destination=Destination(
name='Observation.survey.days_to_last_follow_up'
)
),

Map(
source=Source(
name='diagnoses.updated_datetime'
),
destination=Destination(
name='Observation.survey.updated_datetime'
)
),

Map(
source=Source(
name='submitter_diagnosis_ids'
),
destination=Destination(
name='Condition.identifier'
)
),

# project Maps -----------------------------------------------------
Map(
source=Source(
Expand Down
28 changes: 25 additions & 3 deletions fhirizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
import json
import glob
import gzip
import uuid
import pprint
import requests
from bs4 import BeautifulSoup
from fhirizer.schema import Schema
from importlib.resources import files
import importlib
from pathlib import Path
from fhir.resources.identifier import Identifier
from uuid import uuid5, UUID

DATA_DICT_PATH = "".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'data_dictionary')), "/"])
FIELDS_PATH = "".join([str(Path(importlib.resources.files('fhirizer').parent / 'resources' / 'gdc_resources' / 'fields')), "/"])
Expand Down Expand Up @@ -1069,7 +1072,7 @@ def ncit2mondo(path):
return data


def get_component(key, value=None, component_type=None):
def get_component(key, value=None, component_type=None, system="https://cadsr.cancer.gov/sample_laboratory_observation"):
if component_type == 'string':
value = {"valueString": value}
elif component_type == 'int':
Expand All @@ -1078,14 +1081,16 @@ def get_component(key, value=None, component_type=None):
value = {"valueQuantity": {"value": value}}
elif component_type == 'bool':
value = {"valueBoolean": value}
elif component_type == 'dateTime':
value = {"valueDateTime": value}
else:
pass

component = {
"code": {
"coding": [
{
"system": "https://cadsr.cancer.gov/sample_laboratory_observation",
"system": system,
"code": key,
"display": key
}
Expand All @@ -1098,10 +1103,27 @@ def get_component(key, value=None, component_type=None):

return component


def fhir_ndjson(entity, out_path):
    """Write ``entity`` to ``out_path`` as JSON text (UTF-8).

    A list is written as newline-delimited JSON: one ``json.dumps`` document
    per item, joined with ``"\\n"`` and without a trailing newline. Any other
    value is written as a single JSON document.

    Args:
        entity: A JSON-serializable object, or a list of such objects.
        out_path: Destination file path; the file is created or truncated.
    """
    # Treat the single-entity case as a one-record list so both cases share
    # one write path and the entity is written exactly once.
    records = entity if isinstance(entity, list) else [entity]
    with open(out_path, 'w', encoding='utf8') as file:
        # ensure_ascii=False keeps non-ASCII characters readable in the output.
        file.write('\n'.join(json.dumps(record, ensure_ascii=False) for record in records))


def mint_id(identifier, resource_type, project_id, namespace) -> str:
    """Create a deterministic UUID string from an identifier.

    Mints ids via Walsh's convention:
    https://github.com/ACED-IDP/g3t_etl/blob/d095895b0cf594c2fd32b400e6f7b4f9384853e2/g3t_etl/__init__.py#L61

    Args:
        identifier: Either an identifier string that is used as-is, or a
            tuple whose first two items are combined into
            ``"{resource_type}/{item0}|{item1}"`` (presumably a
            ``(system, value)`` pair -- confirm against callers).
        resource_type: Prefix used when ``identifier`` is a tuple; must be
            truthy in that case. Not used for non-tuple identifiers.
        project_id: Prefix placed in front of the identifier before hashing.
        namespace: UUID namespace passed to ``uuid5``.

    Returns:
        The string form of the resulting UUIDv5.

    Raises:
        AssertionError: If ``identifier`` is a tuple and ``resource_type``
            is missing/falsy.
    """
    if isinstance(identifier, tuple):
        if not resource_type:
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O`` (which would silently mint an id
            # from "None/..."). The exception type and message are kept so
            # existing callers see the same error.
            raise AssertionError("resource_type is required for Identifier")
        identifier = f"{resource_type}/{identifier[0]}|{identifier[1]}"
    return _mint_id(identifier, project_id, namespace)


def _mint_id(identifier_string: str, project_id: str, namespace: UUID) -> str:
    """Return the UUIDv5 string of ``"{project_id}/{identifier_string}"`` in ``namespace``."""
    return str(uuid5(namespace, f"{project_id}/{identifier_string}"))

Loading

0 comments on commit a1e9229

Please sign in to comment.