PR 10 of n - Molecular Subtyping - HGG (Joining cleaned data) (AlexsL…

…emonade#435) * Make wording consistent, rerun * Add notebook for combining CN, mutation data * Add notebook to shell script * Rename to reflect change in purpose * WIP: adding RNA data * Fleshed out HGG subtyping notebook * Fix notebook name in shell script * Change broad and short histology columns * HGG -> DMG * Rerun subsetting step Changes appear to be related to removal of some samples from the stranded dataset between v12 and v13 * Response to @jharenza comments * Remove outdated output file * Update README for 07 * Use older version of annotated CNVkit file * Handled fusions consistent with AlexsLemonade#478 * Rerun with v13 data * Account for broad histology columns * Rerun whole pipeline * Use consensus CN files; rerun * Was using the CI files last time * Run copy number notebook with consensus calls * Fix filtering step HGG -> HGAT * Including the consensus CN calls was a little more nuanced * Rerun final notebook with upstream changes * Make sure notebook text is up to date * Update CN files in modules at a glance * Add other two K28M genes * Additional 2 genes in cleaned mutation * Response to @jharenza comments * Update documentation * Additional genes did not make it to documentation * Fix typo in subtype lable * Update analyses/molecular-subtyping-HGG/07-HGG-molecular-subtyping-combine-table.Rmd Co-Authored-By: Candace Savonen <cansav09@gmail.com> * Response to @cansavvy comments Co-authored-by: Candace Savonen <cansav09@gmail.com>
UCSC-Treehouse · Feb 3, 2020 · 0edfe5f · 0edfe5f
1 parent 76612dc
commit 0edfe5f
Show file tree

Hide file tree

Showing 26 changed files with 7,200 additions and 2,414 deletions.
diff --git a/analyses/molecular-subtyping-HGG/01-HGG-molecular-subtyping-defining-lesions.Rmd b/analyses/molecular-subtyping-HGG/01-HGG-molecular-subtyping-defining-lesions.Rmd
@@ -16,7 +16,9 @@ molecular subtyping high-grade glioma samples in the OpenPBTA dataset.
 This notebook is intended to be run via the command line from the top directory
 of the repository as follows:
 
-`Rscript -e "rmarkdown::render('analyses/molecular-subtyping-HGG/01-HGG-molecular-subtyping-defining-lesions.Rmd', clean = TRUE)"`
+```
+Rscript -e "rmarkdown::render('analyses/molecular-subtyping-HGG/01-HGG-molecular-subtyping-defining-lesions.Rmd', clean = TRUE)"
+```
 
 # Set Up
 
@@ -52,6 +54,7 @@ select_metadata <- metadata %>%
   dplyr::select(Kids_First_Participant_ID,
                 sample_id,
                 Kids_First_Biospecimen_ID,
+                broad_histology,
                 short_histology,
                 disease_type_new)
 
@@ -69,7 +72,8 @@ snv_df <-
 ```{r}
 # Filter the snv consensus mutatation data for the target lesions
 snv_lesions_df <- snv_df %>%
-  dplyr::filter(Hugo_Symbol %in% c("H3F3A", "HIST1H3B") &
+  dplyr::filter(Hugo_Symbol %in% c("H3F3A", "HIST1H3B",
+                                   "HIST1H3C", "HIST2H3C") &
                   HGVSp_Short %in% c("p.K28M", "p.G35R",
                                      "p.G35V")) %>%
   dplyr::select(Tumor_Sample_Barcode, Hugo_Symbol, HGVSp_Short) %>%
@@ -79,8 +83,13 @@ snv_lesions_df <- snv_df %>%
                                   TRUE ~ "No"),
     HIST1H3B.K28M = dplyr::case_when(
       Hugo_Symbol == "HIST1H3B" & HGVSp_Short == "p.K28M" ~ "Yes",
-      TRUE ~ "No"
-    ),
+      TRUE ~ "No"),
+    HIST1H3C.K28M = dplyr::case_when(
+      Hugo_Symbol == "HIST1H3C" & HGVSp_Short == "p.K28M" ~ "Yes",
+      TRUE ~ "No"),
+    HIST2H3C.K28M = dplyr::case_when(
+      Hugo_Symbol == "HIST2H3C" & HGVSp_Short == "p.K28M" ~ "Yes",
+      TRUE ~ "No"),
     H3F3A.G35R = dplyr::case_when(Hugo_Symbol == "H3F3A" &
                                     HGVSp_Short == "p.G35R" ~ "Yes",
                                   TRUE ~ "No"),
@@ -111,18 +120,36 @@ snv_lesions_df <- select_metadata %>%
   dplyr::select(
     dplyr::ends_with("ID"),
     dplyr::starts_with("H"),
+    broad_histology,
     short_histology,
     disease_type_new
   ) %>%
   dplyr::mutate(
     disease_type_reclassified = dplyr::case_when(
-      H3F3A.K28M == "Yes" ~ "High-grade glioma, H3 K28 mutant",
-        HIST1H3B.K28M == "Yes" ~ "High-grade glioma, H3 K28 mutant",
-        H3F3A.G35R == "Yes" ~ "High-grade glioma, H3 G35 mutant",
-        H3F3A.G35V == "Yes" ~ "High-grade glioma, H3 G35 mutant",
-      TRUE ~ as.character(disease_type_new)
+      H3F3A.K28M == "Yes" ~ "Diffuse midline glioma, H3 K28 mutant",
+      HIST1H3B.K28M == "Yes" ~ "Diffuse midline glioma, H3 K28 mutant",
+      HIST1H3C.K28M == "Yes" ~ "Diffuse midline glioma, H3 K28 mutant",
+      HIST2H3C.K28M == "Yes" ~ "Diffuse midline glioma, H3 K28 mutant",
+      H3F3A.G35R == "Yes" ~ "High-grade glioma, H3 G35 mutant",
+      H3F3A.G35V == "Yes" ~ "High-grade glioma, H3 G35 mutant",
+      TRUE ~ as.character(disease_type_new)),
+    short_histology_reclassified = dplyr::case_when(
+      H3F3A.K28M == "Yes" ~ "HGAT",
+      HIST1H3B.K28M == "Yes" ~ "HGAT",
+      HIST1H3C.K28M == "Yes" ~ "HGAT",
+      HIST2H3C.K28M == "Yes" ~ "HGAT",
+      H3F3A.G35R == "Yes" ~ "HGAT",
+      H3F3A.G35V == "Yes" ~ "HGAT",
+      TRUE ~ as.character(short_histology)),
+    broad_histology_reclassified = dplyr::case_when(
+      H3F3A.K28M == "Yes" ~ "Diffuse astrocytic and oligodendroglial tumor",
+      HIST1H3B.K28M == "Yes" ~ "Diffuse astrocytic and oligodendroglial tumor",
+      HIST1H3C.K28M == "Yes" ~ "Diffuse astrocytic and oligodendroglial tumor",
+      HIST2H3C.K28M == "Yes" ~ "Diffuse astrocytic and oligodendroglial tumor",
+      H3F3A.G35R == "Yes" ~ "Diffuse astrocytic and oligodendroglial tumor",
+      H3F3A.G35V == "Yes" ~ "Diffuse astrocytic and oligodendroglial tumor",
+      TRUE ~ as.character(broad_histology)),
     )
-  )
 
 # Display `snv_lesions_df`
 snv_lesions_df 
@@ -143,7 +170,7 @@ readr::write_tsv(snv_lesions_df,
 # as HGG or DIPG
 snv_lesions_df %>%
   dplyr::filter(
-    grepl("High-grade glioma", disease_type_reclassified) &
+    grepl("High-grade glioma|Diffuse midline glioma", disease_type_reclassified) &
       !(disease_type_new %in% c("High-grade glioma", 
                                 "Brainstem glioma- Diffuse intrinsic pontine glioma"))
   )

diff --git a/analyses/molecular-subtyping-HGG/01-HGG-molecular-subtyping-defining-lesions.nb.html b/analyses/molecular-subtyping-HGG/01-HGG-molecular-subtyping-defining-lesions.nb.html
diff --git a/analyses/molecular-subtyping-HGG/02-HGG-molecular-subtyping-subset-files.R b/analyses/molecular-subtyping-HGG/02-HGG-molecular-subtyping-subset-files.R
@@ -22,6 +22,8 @@
 # Use this as the root directory to ensure proper sourcing of functions no
 # matter where this is called from
 root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+data_dir <- file.path(root_dir, "data")
+scratch_dir <- file.path(root_dir, "scratch")
 
 # Set path to subset directory
 subset_dir <-
@@ -45,49 +47,43 @@ select_metadata <- metadata %>%
 stranded_expression <-
   readr::read_rds(
     file.path(
-      root_dir,
-      "data",
+      data_dir,
       "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"
     )
   )
 
 polya_expression <-
   readr::read_rds(
     file.path(
-      root_dir,
-      "data",
+      data_dir,
       "pbta-gene-expression-rsem-fpkm-collapsed.polya.rds"
     )
   )
 
 
 # Read in focal CN data
-## TODO: This section will be updated to read in focal CN data derived from
-##       copy number consensus calls.
+## TODO: If annotated files get included in data download
 cn_df <- readr::read_tsv(
-  file.path(
-    root_dir,
-    "analyses",
-    "focal-cn-file-preparation",
-    "results",
-    "cnvkit_annotated_cn_autosomes.tsv.gz"
-  )
+  file.path(root_dir, "analyses", "focal-cn-file-preparation", "results",
+            "consensus_seg_annotated_cn_autosomes.tsv.gz")
 )
 
 # Read in fusion data
 fusion_df <- readr::read_tsv(
-  file.path(root_dir, "data", "pbta-fusion-putative-oncogenic.tsv"))
+  file.path(data_dir, "pbta-fusion-putative-oncogenic.tsv"))
 
 # Read in GISTIC `broad_values_by_arm.txt` file
-gistic_df <-
-  data.table::fread(unzip(
-    file.path(root_dir, "data", "pbta-cnv-cnvkit-gistic.zip"),
-    files = file.path(
-      "2019-12-10-gistic-results-cnvkit",
-      "broad_values_by_arm.txt"
-    ),
-    exdir = file.path(root_dir, "scratch")
-  ), data.table = FALSE)
+# TODO: update once the consensus GISTIC results are in the data release
+download.file(url = "https://github.com/AlexsLemonade/OpenPBTA-analysis/files/4123481/2019-01-28-consensus-cnv.zip",
+              destfile = file.path(scratch_dir, "2019-01-28-consensus-cnv.zip"),
+              quiet = TRUE)
+unzip(file.path(scratch_dir, "2019-01-28-consensus-cnv.zip"),
+      exdir = file.path(scratch_dir, "2019-01-28-consensus-cnv"),
+      files = file.path("2019-01-28-consensus-cnv", "broad_values_by_arm.txt"))
+gistic_df <- data.table::fread(file.path(scratch_dir,
+                                         "2019-01-28-consensus-cnv",
+                                         "broad_values_by_arm.txt"),
+                               data.table = FALSE)
 
 # Read in snv consensus mutation data
 snv_maf_df <-
@@ -127,11 +123,11 @@ hgg_lesions_df <- hgg_lesions_df %>%
 
 #### Filter metadata -----------------------------------------------------------
 
-# Filter metadata for `High-grade glioma` and samples that should be classified
+# Filter metadata for HGAT and samples that should be classified
 # as High-grade glioma based on defining lesions
 hgg_metadata_df <- metadata %>%
   dplyr::filter(
-    disease_type_new == "High-grade glioma" |
+    short_histology == "HGAT" |
       sample_id %in% hgg_lesions_df$sample_id,
     sample_type == "Tumor",
     composition == "Solid Tissue"
@@ -186,7 +182,7 @@ cn_metadata <- cn_df %>%
                 cytoband) %>%
   dplyr::filter(biospecimen_id %in% hgg_metadata_df$Kids_First_Biospecimen_ID) %>%
   dplyr::distinct() # Remove duplicate rows produced as a result of not
-                    # including the copy number variable from `cn_df`
+# including the copy number variable from `cn_df`
 
 # Write to file
 readr::write_tsv(cn_metadata, file.path(subset_dir, "hgg_focal_cn.tsv.gz"))

diff --git a/analyses/molecular-subtyping-HGG/03-HGG-molecular-subtyping-cnv.Rmd b/analyses/molecular-subtyping-HGG/03-HGG-molecular-subtyping-cnv.Rmd
@@ -3,20 +3,15 @@ title: "High-grade Glioma Molecular Subtyping - Focal and Broad Copy Number Alte
 author: "Chante Bethell, Stephanie J. Spielman, and Jaclyn Taroni for ALSF CCDL"
 date: "2020"
 output:
-  html_document:
-    df_print: paged
-    toc: yes
   html_notebook:
     toc: yes
     toc_float: yes
 ---
 
 This notebook prepares focal and broad copy number alteration data for the 
-purposes of use for subtyping HGG samples 
+purpose of subtyping HGG samples 
 ([`AlexsLemonade/OpenPBTA-analysis#249`](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/249)).
 
-#### TODO: This analysis should be revisited when consensus copy data is made available.
-
 ## Usage
 
 This notebook is intended to be run via the command line from the top directory
@@ -48,6 +43,74 @@ if (!dir.exists(results_dir)) {
 
 ### Read in Files
 
+We will need the clinical/histologies file and the defining lesions file to distinguish between missing samples or samples that had only neutral copy number calls.
+
+```{r message=FALSE}
+histologies_df <- read_tsv(file.path(root_dir, "data",
+                                     "pbta-histologies.tsv"))
+defining_lesions_df <- read_tsv(file.path(results_dir, 
+                                          "HGG_defining_lesions.tsv"))
+```
+
+#### Inclusion Criteria
+
+Here, we'll use the same logic as `02-HGG-molecular-subtyping-subset-files.R` to identify samples to include and also check for experimental strategy.
+
+```{r}
+hgg_lesions_df <- defining_lesions_df %>%
+  dplyr::filter(
+    short_histology == "HGAT" |
+      grepl("H3 G35 mutant|H3 K28 mutant", disease_type_reclassified)
+  )
+
+# Filter metadata for HGAT and samples that should be classified
+# as High-grade glioma based on defining lesions
+hgg_metadata_df <- histologies_df %>%
+  dplyr::filter(
+    short_histology == "HGAT" |
+      sample_id %in% hgg_lesions_df$sample_id,
+    sample_type == "Tumor",
+    composition == "Solid Tissue"
+  )
+```
+
+Only WGS samples have CNV and SV data, so we'll filter based on `experimental_strategy`.
+
+```{r}
+included_wgs_samples <- intersect(hgg_lesions_df %>% 
+                                    pull(Kids_First_Biospecimen_ID),
+                                  hgg_metadata_df %>% 
+                                    filter(experimental_strategy == "WGS") %>%
+                                    pull(Kids_First_Biospecimen_ID))
+```
+
+Are all the WGS samples that pass the inclusion criteria in the consensus SEG file committed to the repository?
+
+```{r}
+consensus_seg_ids <- read_tsv(file.path(root_dir, 
+                                        "analyses", 
+                                        "copy_number_consensus_call",
+                                        "results",
+                                        "pbta-cnv-consensus.seg.gz")) %>%
+  pull(ID)
+consensus_seg_ids <- unique(consensus_seg_ids)
+```
+
+```{r}
+all(included_wgs_samples %in% consensus_seg_ids)
+```
+
+No, they're not.
+How many are missing?
+
+```{r}
+sum(!(included_wgs_samples %in% consensus_seg_ids))
+```
+
+We'd then expect `r sum(included_wgs_samples %in% consensus_seg_ids)` samples to have non-NA values in the cleaned CNV files.
+
+#### Read in CN Data
+
 ```{r}
 # Read in HGG subset focal CN data
 cn_df <- read_tsv(file.path(input_dir, "hgg_focal_cn.tsv.gz"))
@@ -61,15 +124,10 @@ gistic_df <- read_tsv(file.path(input_dir, "hgg_gistic_broad_values.tsv"))
 output_file <- file.path(results_dir, "HGG_cleaned_cnv.tsv")
 ```
 
-## CNVkit Data
-
-We are using CNVkit data that has been annotated via the [`focal-cn-file-preparation`](../focal-cn-file-preparation/) module.
-Note that a gene may have evidence for two kinds of copy number alteration (e.g., both _loss_ and _gain_) in these data.
-This may or may not be a caller-specific artifact that will be alleviated by generating consensus copy number calls 
-([`AlexsLemonade/OpenPBTA-analysis#128`](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/128)).
-See also: https://jaclyn-taroni.github.io/openpbta-notebook-concept/both-gain-and-loss.nb.html.
-
+## Annotated consensus CN data
 
+We are using consensus copy number data data that has been annotated via the [`focal-cn-file-preparation`](../focal-cn-file-preparation/) module.
+Note that a gene may have evidence for two kinds of copy number alteration (e.g., both _loss_ and _gain_) in these data, but no longer see instances of this when using the consensus data for these samples.
 
 ```{r}
 # Select only the ID information, for use in joining below
@@ -107,7 +165,6 @@ cn_df_genes_status <- cn_df_genes %>%
 head(cn_df_genes_status, n = 10)
 ```
 
-
 ## GISTIC Data
 
 The GISTIC `broad_values_by_arm.txt` file is a matrix with integers that indicate chromosome arm status. 
@@ -126,24 +183,37 @@ gistic_status_df <- gistic_df %>%
               . < 0 ~ "loss",
               . > 0 ~ "gain",
               . == 0 ~ "neutral"
-            ))) %>%
-  # drop sample_id
-  select(-sample_id)
+            )))
 
 head(gistic_status_df, n = 10)
 ```
 
+Add in participant IDs.
+
+```{r}
+gistic_status_df <- hgg_metadata_df %>% 
+  select(Kids_First_Biospecimen_ID,
+         Kids_First_Participant_ID,
+         sample_id) %>%
+  right_join(gistic_status_df)
+```
+
 ## Join Together
 
 ```{r}
-# There are some samples that had too many segments and were flagged by GISTIC
-# These values will be filled with NA -- we'll change them to indicate that 
-# they failed GISTIC QC
-final_hgg_cn_df <- left_join(cn_df_genes_status,
-                             gistic_status_df,
-                             by = "Kids_First_Biospecimen_ID") %>%
-  replace(is.na(.), "Failed GISTIC QC")
-head(final_hgg_cn_df, n = 10)
+final_hgg_cn_df <- full_join(cn_df_genes_status,
+                             gistic_status_df)
+
+nrow(final_hgg_cn_df)
+```
+
+`final_hgg_cn_df` includes all the samples that are in the consensus SEG file.
+So any missing CNA values at this stage come from the fact that neutral values are _not_ included in the annotated copy number files.
+We can fill any "missing values" with neutral calls as a result.
+
+```{r}
+final_hgg_cn_df <- final_hgg_cn_df %>%
+  replace(is.na(.), "neutral")
 ```
 
 Write this to file.