From 4b3cc16ca468ff4b05de16e906306723f6f32d09 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Wed, 31 Jan 2024 13:11:09 +0100
Subject: [PATCH] perf: improve spia script and report (#83)

These changes come from implementing spia in a GFOLD-based workflow
here:

https://github.com/dlaehnemann/rna-seq-conservative-fold-change-without-replicates

This was modeled on the spia implementation here, and led to some
cleanup of the `spia.R` script, to the removal of unnecessary report
tables that didn't add any value and to some improvements of the spia
reporting table.
---
 workflow/report/spia.rst                      |  16 +-
 .../resources/datavzrd/spia-template.yaml     | 161 +++-------------
 workflow/rules/datavzrd.smk                   |  15 +-
 workflow/rules/enrichment.smk                 |   2 -
 workflow/scripts/spia.R                       | 178 +++++++++---------
 5 files changed, 137 insertions(+), 235 deletions(-)

diff --git a/workflow/report/spia.rst b/workflow/report/spia.rst
index 9ed17445..bc6fca44 100644
--- a/workflow/report/spia.rst
+++ b/workflow/report/spia.rst
@@ -1,5 +1,17 @@
 **Pathway enrichment** performed with SPIA, using the model ``{{ snakemake.config["diffexp"]["models"][snakemake.wildcards.model]["full"] }}``.
 
-The table contains the following columns (also see the `SPIA docs <https://rdrr.io/bioc/SPIA/man/spia.html>`_): 
-``pSize`` is the number of genes on the pathway; ``NDE`` is the number of DE genes per pathway; ``tA`` is the observed total perturbation accumulation in the pathway; ``pNDE`` is the probability to observe at least NDE genes on the pathway using a hypergeometric model; ``pPERT`` is the probability to observe a total accumulation more extreme than tA only by chance; ``pG`` is the p-value obtained by combining pNDE and pPERT; ``pGFdr`` and ``pGFWER`` are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the ``Status`` gives the direction in which the pathway is perturbed (``activated`` or ``inhibited``). KEGGLINK gives a web link to the KEGG website that displays the pathway image with the differentially expressed genes highlighted in red.
+The table contains the following columns that have been renamed for descriptive titles (also see the `SPIA docs <https://rdrr.io/bioc/SPIA/man/spia.html>`_; for renamed columns, original spia column names are mentioned in parentheses): 
+**Name** of the pathway;
+**number of genes on the pathway** (``pSize``);
+**number of DE genes per pathway** where DE signifies "differentially expressed" (``NDE``);
+**total perturbation accumulation** (``tA``);
+**Combined FDR** where FDR signifies "false discovery rate" (``pGFdr``);
+**Status** of the pathway, inhibited vs. activated.
 
+The following columns (available from spia output), are hidden in this table in favour of the combined FDR as an overall assessment of the reliability of a pathway's perturbation.
+You can access them per pathway by clicking on the leading ``+`` symbol of a row:
+**p-value for at least NDE genes** where NDE signifies "n differentially expressed" (``pNDE``);
+**p-value to observe a total accumulation** (``pPERT``);
+**Combined p-value** (``pG``);
+**Combined Bonferroni p-values** (``pGFWER``);
+**pathway id** provided by the pathway database used.
\ No newline at end of file
diff --git a/workflow/resources/datavzrd/spia-template.yaml b/workflow/resources/datavzrd/spia-template.yaml
index fdac5d29..2bf55e60 100644
--- a/workflow/resources/datavzrd/spia-template.yaml
+++ b/workflow/resources/datavzrd/spia-template.yaml
@@ -1,181 +1,68 @@
-name: ?f"Pathway impact analysis for model {wildcards.model}"
+name: ?f"spia pathway impact analysis for model {wildcards.model}"
 datasets:
   spia_table:
     path: ?input.spia_table
     offer-excel: true
     separator: "\t"
-  spia_table_activated:
-    path: ?input.spia_table_activated
-    offer-excel: true
-    separator: "\t"
-  spia_table_inhibited:
-    path: ?input.spia_table_inhibited
-    offer-excel: true
-    separator: "\t"
 default-view: spia_table
 views:
   spia_table:
     dataset: spia_table
     desc: |
-        The table contains the following columns pSize is the number of genes on the pathway; NDE is the number of DE genes per pathway; tA is the observed total perturbation accumulation in the pathway; pNDE is the probability to observe at least NDE genes on the pathway using a hypergeometric model; pPERT is the probability to observe a total accumulation more extreme than tA only by chance; pG is the p-value obtained by combining pNDE and pPERT; pGFdr and pGFWER are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the Status gives the direction in which the pathway is perturbed (activated or inhibited).
+      ?f"spia pathway impact analysis for model {wildcards.model}"
     page-size: 25
     render-table:
       columns:
         Name:
           display-mode: normal
           link-to-url: 
-            reactome:
-              url: "http://reactome.org/PathwayBrowser/#/{Ids}"
+            pathway:
+              ?if params.pathway_db == "reactome":
+                url: "http://reactome.org/PathwayBrowser/#/{pathway id}"
+              ?elif params.pathway_db == "panther":
+                url: "https://www.pantherdb.org/pathway/pathwayDiagram.jsp?catAccession={pathway id}"
+              # we should add all the pathway databases that bioconductor-graphite enables (see its `pathwayDatabases()` function)
+              ?else: # not sure what a good fallback would be here
+                url: "http://reactome.org/PathwayBrowser/#/{pathway id}"
         number of genes on the pathway:
           plot:
             heatmap:
               scale: linear
               range:
-                - white
-                - "#186904"        
+                - "#F7F7F7"
+                - "#B2182B"
         number of DE genes per pathway:
           plot:
             heatmap:
               scale: linear
               range:
-                - white
-                - "#186904"
+                - "#F7F7F7"
+                - "#B2182B"
         p-value for at least NDE genes:
-          display-mode: hidden
+          display-mode: detail
         total perturbation accumulation:
           plot:
             heatmap:
               scale: linear
               range:
-                - "#e6550d"
-                - "white"
-                - "#6baed6"
-              domain:
-                - -1
-                - 0
-                - 1
+                - "#B2182B"
+                - "#F7F7F7"
+                - "#2166AC"
+              domain-mid: 0
         p-value to observe a total accumulation:
-          display-mode: hidden
+          display-mode: detail
         Combined p-value:
-          display-mode: hidden
+          display-mode: detail
         Combined FDR:
           plot:
             bars:
               scale: linear
         Combined Bonferroni p-values:
-          display-mode: hidden     
+          display-mode: detail     
         Status:
           plot:
             heatmap:
               scale: ordinal
               color-scheme: accent
-        Ids:
-          display-mode: hidden
-  spia_table_activated:
-    dataset: spia_table_activated
-    desc: |
-        The table (sorted by "Status:Activated") contains the following columns pSize is the number of genes on the pathway; NDE is the number of DE genes per pathway; tA is the observed total perturbation accumulation in the pathway; pNDE is the probability to observe at least NDE genes on the pathway using a hypergeometric model; pPERT is the probability to observe a total accumulation more extreme than tA only by chance; pG is the p-value obtained by combining pNDE and pPERT; pGFdr and pGFWER are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the Status gives the direction in which the pathway is perturbed (activated).
-    page-size: 25
-    render-table:
-      columns:
-        Name:
-          display-mode: normal
-          link-to-url: 
-            reactome:
-              url: "http://reactome.org/PathwayBrowser/#/{Ids}"
-        number of genes on the pathway:
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - white
-                - "#186904"        
-        number of DE genes per pathway:
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - white
-                - "#186904"
-        p-value for at least NDE genes:
-          display-mode: hidden
-        total perturbation accumulation:
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - "#e6550d"
-                - "white"
-                - "#6baed6"
-              domain:
-                - -1
-                - 0
-                - 1
-        p-value to observe a total accumulation:
-          display-mode: hidden
-        Combined p-value:
-          display-mode: hidden
-        Combined FDR:
-          plot:
-            bars:
-              scale: linear
-        Combined Bonferroni p-values:
-          display-mode: hidden     
-        Status:
-          display-mode: normal
-        Ids:
-          display-mode: hidden 
-  spia_table_inhibited:
-    dataset: spia_table_inhibited
-    desc: |
-        The table (sorted by "Status:Inhibited") contains the following columns pSize is the number of genes on the pathway; NDE is the number of DE genes per pathway; tA is the observed total perturbation accumulation in the pathway; pNDE is the probability to observe at least NDE genes on the pathway using a hypergeometric model; pPERT is the probability to observe a total accumulation more extreme than tA only by chance; pG is the p-value obtained by combining pNDE and pPERT; pGFdr and pGFWER are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the Status gives the direction in which the pathway is perturbed (inhibited).
-    page-size: 25
-    render-table:
-      columns:
-        Name:
-          display-mode: normal
-          link-to-url: 
-            reactome:
-              url: "http://reactome.org/PathwayBrowser/#/{Ids}"
-        number of genes on the pathway:
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - white
-                - "#186904"        
-        number of DE genes per pathway:
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - white
-                - "#186904"
-        p-value for at least NDE genes:
-          display-mode: hidden
-        total perturbation accumulation:
-          plot:
-            heatmap:
-              scale: linear
-              range:
-                - "#e6550d"
-                - "white"
-                - "#6baed6"
-              domain:
-                - -1
-                - 0
-                - 1
-        p-value to observe a total accumulation:
-          display-mode: hidden
-        Combined p-value:
-          display-mode: hidden
-        Combined FDR:
-          plot:
-            bars:
-              scale: linear
-        Combined Bonferroni p-values:
-          display-mode: hidden     
-        Status:
-          display-mode: normal
-        Ids:
-          display-mode: hidden  
\ No newline at end of file
+        pathway id:
+          display-mode: detail
\ No newline at end of file
diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
index 253bbc52..f41debf8 100644
--- a/workflow/rules/datavzrd.smk
+++ b/workflow/rules/datavzrd.smk
@@ -2,12 +2,12 @@ rule render_datavzrd_config_spia:
     input:
         template=workflow.source_path("../resources/datavzrd/spia-template.yaml"),
         spia_table="results/tables/pathways/{model}.pathways.tsv",
-        spia_table_activated="results/tables/pathways/{model}.activated-pathways.tsv",
-        spia_table_inhibited="results/tables/pathways/{model}.inhibited-pathways.tsv",
     output:
         "results/datavzrd/spia/{model}.yaml",
     log:
         "logs/yte/render-datavzrd-config-spia/{model}.log",
+    params:
+        pathway_db=config["enrichment"]["spia"]["pathway_database"],
     template_engine:
         "yte"
 
@@ -53,8 +53,6 @@ rule spia_datavzrd:
         config="results/datavzrd/spia/{model}.yaml",
         # files required for rendering the given configs
         spia_table="results/tables/pathways/{model}.pathways.tsv",
-        spia_table_activated="results/tables/pathways/{model}.activated-pathways.tsv",
-        spia_table_inhibited="results/tables/pathways/{model}.inhibited-pathways.tsv",
     output:
         report(
             directory("results/datavzrd-reports/spia-{model}"),
@@ -67,8 +65,7 @@ rule spia_datavzrd:
     log:
         "logs/datavzrd-report/spia-{model}/spia-{model}.log",
     wrapper:
-        # "v2.6.0/utils/datavzrd"
-        "v3.3.5-1-gd73914d/utils/datavzrd"
+        "v3.3.6/utils/datavzrd"
 
 
 rule diffexp_datavzrd:
@@ -93,8 +90,7 @@ rule diffexp_datavzrd:
     log:
         "logs/datavzrd-report/diffexp.{model}/diffexp.{model}.log",
     wrapper:
-        # "v2.6.0/utils/datavzrd"
-        "v3.3.5-1-gd73914d/utils/datavzrd"
+        "v3.3.6/utils/datavzrd"
 
 
 rule go_enrichment_datavzrd:
@@ -121,5 +117,4 @@ rule go_enrichment_datavzrd:
     log:
         "logs/datavzrd-report/go_enrichment-{model}/go_enrichment-{model}_{gene_fdr}.go_term_fdr_{go_term_fdr}.log",
     wrapper:
-        # "v2.6.0/utils/datavzrd"
-        "v3.3.5-1-gd73914d/utils/datavzrd"
+        "v3.3.6/utils/datavzrd"
diff --git a/workflow/rules/enrichment.smk b/workflow/rules/enrichment.smk
index 86890731..cb285c73 100644
--- a/workflow/rules/enrichment.smk
+++ b/workflow/rules/enrichment.smk
@@ -12,8 +12,6 @@ rule spia:
         spia_db="resources/spia-db.rds",
     output:
         table="results/tables/pathways/{model}.pathways.tsv",
-        table_activated="results/tables/pathways/{model}.activated-pathways.tsv",
-        table_inhibited="results/tables/pathways/{model}.inhibited-pathways.tsv",
         plots="results/plots/pathways/{model}.spia-perturbation-plots.pdf",
     params:
         bioc_species_pkg=bioc_species_pkg,
diff --git a/workflow/scripts/spia.R b/workflow/scripts/spia.R
index dbd41a5f..9289d23f 100644
--- a/workflow/scripts/spia.R
+++ b/workflow/scripts/spia.R
@@ -6,9 +6,8 @@ library("SPIA")
 library("graphite")
 library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)
 
-# provides library("tidyverse") and functions load_bioconductor_package() and
-# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
-# snakemake@params[["covariate"]]
+# provides library("tidyverse") and get_prefix_col(), where the latter requires
+# snakemake@input[["samples"]] and snakemake@params[["covariate"]]
 source(snakemake@params[["common_src"]])
 
 pw_db <- snakemake@params[["pathway_db"]]
@@ -16,91 +15,102 @@ db <- readRDS(snakemake@input[["spia_db"]])
 
 options(Ncpus = snakemake@threads)
 
-diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
-    drop_na(ens_gene) %>%
-    mutate(ens_gene = str_c("ENSEMBL:", ens_gene))
-universe <- diffexp %>% pull(var = ens_gene)
-sig_genes <- diffexp %>% filter(qval <= 0.05)
+diffexp <- read_tsv(snakemake@input[["diffexp"]]) |>
+  drop_na(ens_gene) |>
+  mutate(ens_gene = str_c("ENSEMBL:", ens_gene))
+
+universe <- diffexp |>
+  dplyr::select(ens_gene) |>
+  distinct() |>
+  pull(ens_gene)
+
+sig_genes <- diffexp |>
+  filter(qval <= 0.05)
+
+columns <- c(
+  "Name",
+  "number of genes on the pathway",
+  "number of DE genes per pathway",
+  "p-value for at least NDE genes",
+  "total perturbation accumulation",
+  "p-value to observe a total accumulation",
+  "Combined p-value",
+  "Combined FDR",
+  "Combined Bonferroni p-values",
+  "Status",
+  "pathway id"
+)
+
 if (nrow(sig_genes) == 0) {
-    cols <- c(
-        "Name", "Status", "Combined FDR",
-        "total perturbation accumulation", "number of genes on the pathway",
-        "number of DE genes per pathway", "p-value for at least NDE genes",
-        "Combined Bonferroni p-values",
-        "p-value to observe a total accumulation", "Combined p-value", "Ids"
-    )
-    res <- data.frame(matrix(ncol = 11, nrow = 0, dimnames = list(NULL, cols)))
-    # create empty perturbation plots
-    pdf(file = snakemake@output[["plots"]])
-    write_tsv(res, snakemake@output[["table"]])
-    write_tsv(res, snakemake@output[["table_activated"]])
-    write_tsv(res, snakemake@output[["table_inhibited"]])
-    dev.off()
+  # the best hack for an empty tibble from a column specification I could find
+  res <- read_csv("\n", col_names = columns)
+  write_tsv(res, snakemake@output[["table"]])
+  write_tsv(res, snakemake@output[["table_activated"]])
+  write_tsv(res, snakemake@output[["table_inhibited"]])
 } else {
-    # get logFC equivalent (the sum of beta scores of covariates of interest)
+  # get logFC equivalent (the sum of beta scores of covariates of interest)
 
-    beta_col <- get_prefix_col("b", colnames(sig_genes))
+  beta_col <- get_prefix_col("b", colnames(sig_genes))
 
-    beta <- sig_genes %>%
-        dplyr::select(ens_gene, !!beta_col) %>%
-        deframe()
+  beta <- sig_genes |>
+    dplyr::select(ens_gene, !!beta_col) |>
+    deframe()
 
-    t <- tempdir(check = TRUE)
-    olddir <- getwd()
-    setwd(t)
-    prepareSPIA(db, pw_db)
-    res <- runSPIA(
-        de = beta, all = universe, pw_db,
-        plots = TRUE, verbose = TRUE
-    )
-    setwd(olddir)
+  t <- tempdir(check = TRUE)
+  olddir <- getwd()
+  setwd(t)
+  prepareSPIA(db, pw_db)
+  res <- runSPIA(
+    de = beta,
+    all = universe,
+    pw_db,
+    plots = TRUE,
+    verbose = TRUE
+  )
+  setwd(olddir)
 
-    file.copy(
-        file.path(t, "SPIAPerturbationPlots.pdf"),
-        snakemake@output[["plots"]]
-    )
-    pathway_names <- db[res$Name]
-    pathway_names <- db[res$Name]
-    path_ids <- as.matrix(lapply(pathway_names@entries, slot, "id"))
-    if (length(path_ids) > 0) {
-        path_ids_data_frame <-
-            data.frame(Ids = matrix(unlist(path_ids),
-                nrow = length(path_ids), byrow = TRUE
-            ))
-        final_res <- cbind(res,
-            Ids = path_ids_data_frame$Ids
-        )
-        res_reorder <- dplyr::select(
-            final_res, Name, Status,
-            pGFdr, tA, pSize, NDE, pNDE, pGFWER, pPERT, pG, Ids
-        )
-        res_reorder <- res_reorder %>%
-            rename(
-                "Combined Bonferroni p-values" = "pGFWER",
-                "Combined FDR" = "pGFdr",
-                "total perturbation accumulation" = "tA",
-                "number of genes on the pathway" = "pSize",
-                "number of DE genes per pathway" = "NDE",
-                "Combined p-value" = "pG",
-                "p-value to observe a total accumulation" = "pPERT",
-                "p-value for at least NDE genes" = "pNDE"
-            )
-        write_tsv(res_reorder, snakemake@output[["table"]])
-        sort_activated <- res_reorder[res_reorder$Status == "Activated", ]
-        sort_inhibited <- res_reorder[res_reorder$Status == "Inhibited", ]
-        write_tsv(sort_activated, snakemake@output[["table_activated"]])
-        write_tsv(sort_inhibited, snakemake@output[["table_inhibited"]])
-    } else {
-        columns <- c(
-            "Name", "Status", "Combined FDR", "total perturbation accumulation",
-            "number of genes on the pathway", "number of DE genes per pathway",
-            "p-value for at least NDE genes", "Combined Bonferroni p-values",
-            "p-value to observe a total accumulation", "Combined p-value", "Ids"
+  file.copy(
+    file.path(t, "SPIAPerturbationPlots.pdf"),
+    snakemake@output[["plots"]]
+  )
+  pathway_names <- db[res$Name]
+  if (length(pathway_names) > 0) {
+    pathway_ids_tibble <- pathway_names@entries |>
+      map(slot, "id") |>
+      unlist() |>
+      as_tibble(
+        rownames="pathway_name"
+      ) |>
+      rename(
+        `pathway id` = value
+      )
+    final_res <- as_tibble(res) |>
+      left_join(
+        pathway_ids_tibble,
+        join_by(Name == pathway_name)
+      ) |>
+      rename(
+        "number of genes on the pathway" = "pSize",
+        "number of DE genes per pathway" = "NDE",
+        "p-value for at least NDE genes" = "pNDE",
+        "total perturbation accumulation" = "tA",
+        "p-value to observe a total accumulation" = "pPERT",
+        "Combined p-value" = "pG",
+        "Combined FDR" = "pGFdr",
+        "Combined Bonferroni p-values" = "pGFWER"
+      ) |>
+      dplyr::select(
+        all_of(
+            columns
         )
-        emtpy_data_frame <- data.frame(matrix(nrow = 0, ncol = length(columns)))
-        colnames(emtpy_data_frame) <- columns
-        write_tsv(emtpy_data_frame, snakemake@output[["table"]])
-        write_tsv(emtpy_data_frame, snakemake@output[["table_activated"]])
-        write_tsv(emtpy_data_frame, snakemake@output[["table_inhibited"]])
-    }
-}
+      ) |>
+      arrange(
+        desc(`total perturbation accumulation`)
+      )
+    write_tsv(final_res, snakemake@output[["table"]])
+  } else {
+    # the best hack for an empty tibble from a column specification I could find
+    emtpy_data_frame <- read_csv("\n", col_names = columns)
+    write_tsv(emtpy_data_frame, snakemake@output[["table"]])
+  }
+}
\ No newline at end of file