Skip to content

Commit

Permalink
Merge pull request #49 from bmeg/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
teslajoy authored Oct 11, 2024
2 parents 5a70a7d + 4b501d2 commit 3194f22
Show file tree
Hide file tree
Showing 10 changed files with 3,651 additions and 36 deletions.
60 changes: 56 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


### Project overview:
Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, and International Cancer Genome Consortium (ICGC) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format.
Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, International Cancer Genome Consortium (ICGC), and Human Tumor Atlas Network (HTAN) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format.

- #### GDC study simplified FHIR graph
![mapping](./imgs/gdc_tcga_study_example_fhir_graph.png)
Expand Down Expand Up @@ -75,6 +75,47 @@ Detailed step-by-step guide on FHIRizing data for a project's study can be found
```
fhirizer generate --name icgc --icgc <ICGC_project_name> --has_files
```
- HTAN

FHIRizing HTAN depends on the:
1. Folder hierarchy with naming conventions as below and existence of raw data pulled from HTAN
```
fhirizer/
|-- projects/
| └── HTAN/
| └── OHSU/
| |-- raw/
| | |-- files/
| | | |-- table_data.tsv
| | | └── cds_manifest.csv
| | |-- biospecimens/table_data.tsv
| | └── cases/table_data.tsv
| └── META/
```
2. Existence of the ChEMBL DB file
```
fhirizer/
|-- resources/
└── chembl_resources/chembl_34.db
```

Example run:

for all available atlases under ./projects/HTAN/<Atlas name>
```
fhirizer generate --name htan
```
or for one or more specific atlases (comma-separated):
```commandline
fhirizer generate --name htan --atlas "OHSU,DFCI,WUSTL,BU,CHOP"
```

Validate the FHIRized ndjson files with G3T:
```commandline
for i in $(ls projects/HTAN); do echo $i && g3t meta validate projects/HTAN/$i/META; done
```

### Constructing GDC maps cli cmds

Initialize the structure of a project, case, or file to add Maps
Expand Down Expand Up @@ -145,9 +186,20 @@ fhirizer/
| | |-- filess.ndjson
| | └── META/
| └── ICGC/
| └── ICGC-STUDY/
| |-- data/
| └── META/
| | └── ICGC-STUDY/
| | |-- data/
| | └── META/
| └── HTAN/
| └── OHSU/
| |-- raw/
| | |-- files/
| | | |-- table_data.tsv
| | | └── cds_manifest.csv
| | |-- biospecimens/table_data.tsv
| | └── cases/table_data.tsv
| └── META/
|
|
|--README.md
└── setup.py
```
48 changes: 33 additions & 15 deletions fhirizer/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from fhirizer import utils, mapping, entity2fhir
from fhirizer import icgc2fhir
from fhirizer import utils, mapping, entity2fhir, icgc2fhir, htan2fhir
import click
from pathlib import Path

Expand All @@ -8,28 +7,31 @@ class NotRequiredIf(click.Option):
def __init__(self, *args, **kwargs):
self.not_required_if = kwargs.pop('not_required_if')
assert self.not_required_if, "'not_required_if' parameter required"
if isinstance(self.not_required_if, str):
self.not_required_if = [self.not_required_if]
kwargs['help'] = (kwargs.get('help', '') +
' NOTE: This argument is mutually exclusive with %s' %
self.not_required_if
', '.join(self.not_required_if)
).strip()
super(NotRequiredIf, self).__init__(*args, **kwargs)

def handle_parse_result(self, ctx, opts, args):
we_are_present = self.name in opts
other_present = self.not_required_if in opts
others_present = [opt for opt in self.not_required_if if opt in opts]

if other_present:
if others_present:
if we_are_present:
raise click.UsageError(
"Illegal usage: `%s` is mutually exclusive with `%s`" % (
self.name, self.not_required_if))
self.name, ', '.join(others_present)))
else:
self.prompt = None

return super(NotRequiredIf, self).handle_parse_result(
ctx, opts, args)



@click.group()
def cli():
"""GDC, Cellosaurus, ICGC to FHIR schema Key and Content Mapping"""
Expand Down Expand Up @@ -141,22 +143,29 @@ def convert(name, in_path, out_path, verbose):
show_default=True,
help='entity name to map - project, case, file of GDC or cellosaurus')
@click.option('--out_dir', cls=NotRequiredIf,
not_required_if='icgc',
not_required_if=['htan', 'icgc'],
help='Directory path to save mapped FHIR ndjson files.')
@click.option('--entity_path', cls=NotRequiredIf,
not_required_if='icgc',
help='Path to GDC entity with mapped FHIR like keys (converted file via convert). '
'or Cellosaurus ndjson file of human cell-lines of interest')
not_required_if=['htan', 'icgc'],
help='Path to GDC entity with mapped FHIR like keys (converted file via convert) or Cellosaurus ndjson '
'file of human cell-lines of interest.')
@click.option('--atlas', required=False,
default=['OHSU'],
show_default=True,
help='List of atlas project(s) name to FHIRize. ex. ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford"]')
@click.option('--icgc', help='Name of the ICGC project to FHIRize.')
@click.option('--has_files', is_flag=True, help='Boolean indicating file metatda via new argo site is available @ '
'ICGC/{project}/data directory to FHIRize.')
@click.option('--convert', is_flag=True, help='Boolean indicating to write converted keys to directory')
@click.option('--verbose', is_flag=True)
def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose):
name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc']
assert name in ['project', 'case', 'file', 'cellosaurus', 'icgc'], f'--name is not in {name_list}.'
assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path."
assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path."
def generate(name, out_dir, entity_path, icgc, has_files, atlas, convert, verbose):
name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc', 'htan']
assert name in name_list, f'--name is not in {name_list}.'
if name != 'htan':
assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path."
assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path."
else:
assert Path("./projects/HTAN").is_dir()

if name in 'project':
entity2fhir.project_gdc_to_fhir_ndjson(out_dir=out_dir, projects_path=entity_path, convert=convert, verbose=verbose)
Expand All @@ -168,6 +177,15 @@ def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose):
entity2fhir.cellosaurus2fhir(out_dir=out_dir, path=entity_path)
if name in 'icgc' and icgc:
icgc2fhir.icgc2fhir(project_name=icgc, has_files=has_files)
if name in 'htan':
if isinstance(atlas, str):
if "," in atlas:
atlas = atlas.split(",")
atlas = [a.strip() for a in atlas]
else:
atlas = [atlas]

htan2fhir.htan2fhir(entity_atlas_name=atlas, verbose=verbose)


if __name__ == '__main__':
Expand Down
6 changes: 5 additions & 1 deletion fhirizer/entity2fhir.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def assign_fhir_for_project(project, disease_types=disease_types):
pr_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "program_id"]),
"value": project['ResearchStudy']['ResearchStudy.id']})
pl.append(pr_ident)

rs.identifier = [pr_ident]
rs.id = utils.mint_id(
identifier=pr_ident,
resource_type="ResearchStudy",
Expand All @@ -94,6 +94,7 @@ def assign_fhir_for_project(project, disease_types=disease_types):
else:
p_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "project_id"]),
"value": project['ResearchStudy.id']})
rs.identifier = [p_ident]
rs.id = utils.mint_id(
identifier=p_ident,
resource_type="ResearchStudy",
Expand Down Expand Up @@ -157,6 +158,7 @@ def assign_fhir_for_project(project, disease_types=disease_types):

ref = Reference(**{"reference": "/".join(["ResearchStudy", rs_parent.id])})
rs.partOf = [ref]

# condition -- subject --> patient <--subject-- researchsubject -- study --> researchstudy -- partOf --> researchstudy

return {'ResearchStudy': rs.json(), "ResearchStudy.partOf": rs_parent.json(), 'ResearchStudy_obj': rs,
Expand Down Expand Up @@ -394,6 +396,7 @@ def assign_fhir_for_case(case, disease_types=disease_types, primary_sites=primar
research_subject.status = "active"
research_subject.study = study_ref
research_subject.subject = subject_ref
research_subject.identifier = [patient_id_identifier]
research_subject.id = utils.mint_id(
identifier=patient_id_identifier,
resource_type="ResearchSubject",
Expand Down Expand Up @@ -1890,6 +1893,7 @@ def assign_fhir_for_file(file):
for case in file['cases']:
patient_id_identifier = Identifier.construct()
patient_id_identifier.value = case['Patient.id']
patient_id_identifier.use = "official"
patient_id_identifier.system = "".join(["https://gdc.cancer.gov/", "case_id"])

patient_id = utils.mint_id(identifier=patient_id_identifier, resource_type="Patient", project_id=project_id,
Expand Down
Loading

0 comments on commit 3194f22

Please sign in to comment.