Merge pull request #49 from bmeg/development

Development
bmeg · Oct 11, 2024 · 3194f22 · 3194f22
2 parents 5a70a7d + 4b501d2
commit 3194f22
Show file tree

Hide file tree

Showing 10 changed files with 3,651 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 
 
 ### Project overview:
-Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, and International Cancer Genome Consortium (ICGC) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format.
+Transforms and harmonizes data from Genomic Data Commons (GDC), Cellosaurus cell-lines, International Cancer Genome Consortium (ICGC), and Human Tumor Atlas Network (HTAN) repositories into 🔥 FHIR (Fast Healthcare Interoperability Resources) format.
 
 - #### GDC study simplified FHIR graph 
 ![mapping](./imgs/gdc_tcga_study_example_fhir_graph.png)
@@ -75,6 +75,47 @@ Detailed step-by-step guide on FHIRizing data for a project's study can be found
   ```
    fhirizer generate --name icgc --icgc <ICGC_project_name> --has_files
   ```
+- HTAN
+
+FHIRizing HTAN depends on:
+1. A folder hierarchy with the naming conventions shown below, and the existence of raw data pulled from HTAN
+```
+fhirizer/
+|-- projects/
+|   └── HTAN/ 
+|         └── OHSU/
+|               |-- raw/ 
+|               |    |--  files/
+|               |    |      |-- table_data.tsv
+|               |    |      └── cds_manifest.csv
+|               |    |--  biospecimens/table_data.tsv
+|               |    └──  cases/table_data.tsv
+|               └── META/
+```
+2. Existence of the ChEMBL DB file
+```
+fhirizer/
+|-- resources/
+      └── chembl_resources/chembl_34.db
+
+```
+
+Example run: 
+
+For all available atlases under ./projects/HTAN/<Atlas name>:
+  ```
+   fhirizer generate --name htan 
+  ```
+or for one or more specific atlases:
+```commandline
+fhirizer generate --name htan --atlas "OHSU,DFCI,WUSTL,BU,CHOP"
+```
+
+G3T validate FHIRized ndjson files: 
+```commandline
+for i in $(ls projects/HTAN); do echo $i && g3t meta validate projects/HTAN/$i/META; done
+```
+
 ### Constructing GDC maps cli cmds 
 
 initialize initial structure of project, case, or file to add Maps
@@ -145,9 +186,20 @@ fhirizer/
 |   |           |-- filess.ndjson
 |   |           └── META/
 |   └── ICGC/
-|         └── ICGC-STUDY/ 
-|                |-- data/
-|                └── META/
+|   |     └── ICGC-STUDY/ 
+|   |            |-- data/
+|   |            └── META/
+|   └── HTAN/ 
+|         └── OHSU/
+|               |-- raw/ 
+|               |    |--  files/
+|               |    |      |-- table_data.tsv
+|               |    |      └── cds_manifest.csv
+|               |    |--  biospecimens/table_data.tsv
+|               |    └──  cases/table_data.tsv
+|               └── META/
+|              
+|              
 |--README.md
 └── setup.py
 ```
diff --git a/fhirizer/cli.py b/fhirizer/cli.py
@@ -1,5 +1,4 @@
-from fhirizer import utils, mapping, entity2fhir
-from fhirizer import icgc2fhir
+from fhirizer import utils, mapping, entity2fhir, icgc2fhir, htan2fhir
 import click
 from pathlib import Path
 
@@ -8,28 +7,31 @@ class NotRequiredIf(click.Option):
     def __init__(self, *args, **kwargs):
         self.not_required_if = kwargs.pop('not_required_if')
         assert self.not_required_if, "'not_required_if' parameter required"
+        if isinstance(self.not_required_if, str):
+            self.not_required_if = [self.not_required_if]
         kwargs['help'] = (kwargs.get('help', '') +
                           ' NOTE: This argument is mutually exclusive with %s' %
-                          self.not_required_if
+                          ', '.join(self.not_required_if)
                           ).strip()
         super(NotRequiredIf, self).__init__(*args, **kwargs)
 
     def handle_parse_result(self, ctx, opts, args):
         we_are_present = self.name in opts
-        other_present = self.not_required_if in opts
+        others_present = [opt for opt in self.not_required_if if opt in opts]
 
-        if other_present:
+        if others_present:
             if we_are_present:
                 raise click.UsageError(
                     "Illegal usage: `%s` is mutually exclusive with `%s`" % (
-                        self.name, self.not_required_if))
+                        self.name, ', '.join(others_present)))
             else:
                 self.prompt = None
 
         return super(NotRequiredIf, self).handle_parse_result(
             ctx, opts, args)
 
 
+
 @click.group()
 def cli():
     """GDC, Cellosaurus, ICGC to FHIR schema Key and Content Mapping"""
@@ -141,22 +143,29 @@ def convert(name, in_path, out_path, verbose):
               show_default=True,
               help='entity name to map - project, case, file of GDC or cellosaurus')
 @click.option('--out_dir', cls=NotRequiredIf,
-              not_required_if='icgc',
+              not_required_if=['htan', 'icgc'],
               help='Directory path to save mapped FHIR ndjson files.')
 @click.option('--entity_path', cls=NotRequiredIf,
-              not_required_if='icgc',
-              help='Path to GDC entity with mapped FHIR like keys (converted file via convert). '
-                   'or Cellosaurus ndjson file of human cell-lines of interest')
+              not_required_if=['htan', 'icgc'],
+              help='Path to GDC entity with mapped FHIR like keys (converted file via convert) or Cellosaurus ndjson '
+                   'file of human cell-lines of interest.')
+@click.option('--atlas', required=False,
+              default=['OHSU'],
+              show_default=True,
+              help='List of atlas project(s) name to FHIRize. ex. ["OHSU", "DFCI", "WUSTL", "BU", "CHOP", "Duke", "HMS", "HTAPP", "MSK", "Stanford"]')
 @click.option('--icgc', help='Name of the ICGC project to FHIRize.')
 @click.option('--has_files', is_flag=True, help='Boolean indicating file metatda via new argo site is available @ '
                                                 'ICGC/{project}/data directory to FHIRize.')
 @click.option('--convert', is_flag=True, help='Boolean indicating to write converted keys to directory')
 @click.option('--verbose', is_flag=True)
-def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose):
-    name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc']
-    assert name in ['project', 'case', 'file', 'cellosaurus', 'icgc'], f'--name is not in {name_list}.'
-    assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path."
-    assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path."
+def generate(name, out_dir, entity_path, icgc, has_files, atlas, convert, verbose):
+    name_list = ['project', 'case', 'file', 'cellosaurus', 'icgc', 'htan']
+    assert name in name_list, f'--name is not in {name_list}.'
+    if name != 'htan':
+        assert Path(out_dir).is_dir(), f"Path {out_dir} is not a valid directory path."
+        assert Path(entity_path).is_file(), f"Path {entity_path} is not a valid file path."
+    else:
+        assert Path("./projects/HTAN").is_dir()
 
     if name in 'project':
         entity2fhir.project_gdc_to_fhir_ndjson(out_dir=out_dir, projects_path=entity_path, convert=convert, verbose=verbose)
@@ -168,6 +177,15 @@ def generate(name, out_dir, entity_path, icgc, has_files, convert, verbose):
         entity2fhir.cellosaurus2fhir(out_dir=out_dir, path=entity_path)
     if name in 'icgc' and icgc:
         icgc2fhir.icgc2fhir(project_name=icgc, has_files=has_files)
+    if name in 'htan':
+        if isinstance(atlas, str):
+            if "," in atlas:
+                atlas = atlas.split(",")
+                atlas = [a.strip() for a in atlas]
+            else:
+                atlas = [atlas]
+
+        htan2fhir.htan2fhir(entity_atlas_name=atlas, verbose=verbose)
 
 
 if __name__ == '__main__':

diff --git a/fhirizer/entity2fhir.py b/fhirizer/entity2fhir.py
@@ -84,7 +84,7 @@ def assign_fhir_for_project(project, disease_types=disease_types):
         pr_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "program_id"]),
                                  "value": project['ResearchStudy']['ResearchStudy.id']})
         pl.append(pr_ident)
-
+        rs.identifier = [pr_ident]
         rs.id = utils.mint_id(
             identifier=pr_ident,
             resource_type="ResearchStudy",
@@ -94,6 +94,7 @@ def assign_fhir_for_project(project, disease_types=disease_types):
     else:
         p_ident = Identifier(**{"system": "".join(["https://gdc.cancer.gov/", "project_id"]),
                                 "value": project['ResearchStudy.id']})
+        rs.identifier = [p_ident]
         rs.id = utils.mint_id(
             identifier=p_ident,
             resource_type="ResearchStudy",
@@ -157,6 +158,7 @@ def assign_fhir_for_project(project, disease_types=disease_types):
 
     ref = Reference(**{"reference": "/".join(["ResearchStudy", rs_parent.id])})
     rs.partOf = [ref]
+
     #  condition -- subject --> patient <--subject-- researchsubject -- study --> researchstudy -- partOf --> researchstudy
 
     return {'ResearchStudy': rs.json(), "ResearchStudy.partOf": rs_parent.json(), 'ResearchStudy_obj': rs,
@@ -394,6 +396,7 @@ def assign_fhir_for_case(case, disease_types=disease_types, primary_sites=primar
     research_subject.status = "active"
     research_subject.study = study_ref
     research_subject.subject = subject_ref
+    research_subject.identifier = [patient_id_identifier]
     research_subject.id = utils.mint_id(
         identifier=patient_id_identifier,
         resource_type="ResearchSubject",
@@ -1890,6 +1893,7 @@ def assign_fhir_for_file(file):
         for case in file['cases']:
             patient_id_identifier = Identifier.construct()
             patient_id_identifier.value = case['Patient.id']
+            patient_id_identifier.use = "official"
             patient_id_identifier.system = "".join(["https://gdc.cancer.gov/", "case_id"])
 
             patient_id = utils.mint_id(identifier=patient_id_identifier, resource_type="Patient", project_id=project_id,