From 4b3cc16ca468ff4b05de16e906306723f6f32d09 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 31 Jan 2024 13:11:09 +0100 Subject: [PATCH] perf: improve spia script and report (#83) These changes come from implementing spia in a GFOLD-based workflow here: https://github.com/dlaehnemann/rna-seq-conservative-fold-change-without-replicates This was modeled on the spia implementation here, and led to some cleanup of the `spia.R` script, to the removal of unnecessary report tables that didn't add any value and to some improvements of the spia reporting table. --- workflow/report/spia.rst | 16 +- .../resources/datavzrd/spia-template.yaml | 161 +++------------- workflow/rules/datavzrd.smk | 15 +- workflow/rules/enrichment.smk | 2 - workflow/scripts/spia.R | 178 +++++++++--------- 5 files changed, 137 insertions(+), 235 deletions(-) diff --git a/workflow/report/spia.rst b/workflow/report/spia.rst index 9ed17445..bc6fca44 100644 --- a/workflow/report/spia.rst +++ b/workflow/report/spia.rst @@ -1,5 +1,17 @@ **Pathway enrichment** performed with SPIA, using the model ``{{ snakemake.config["diffexp"]["models"][snakemake.wildcards.model]["full"] }}``. -The table contains the following columns (also see the `SPIA docs `_): -``pSize`` is the number of genes on the pathway; ``NDE`` is the number of DE genes per pathway; ``tA`` is the observed total perturbation accumulation in the pathway; ``pNDE`` is the probability to observe at least NDE genes on the pathway using a hypergeometric model; ``pPERT`` is the probability to observe a total accumulation more extreme than tA only by chance; ``pG`` is the p-value obtained by combining pNDE and pPERT; ``pGFdr`` and ``pGFWER`` are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the ``Status`` gives the direction in which the pathway is perturbed (``activated`` or ``inhibited``). KEGGLINK gives a web link to the KEGG website that displays the pathway image with the differentially expressed genes highlighted in red. +The table contains the following columns that have been renamed for descriptive titles (also see the `SPIA docs `_; for renamed columns, original spia column names are mentioned in parentheses): +**Name** of the pathway; +**number of genes on the pathway** (``pSize``); +**number of DE genes per pathway** where DE signifies "differentially expressed" (``NDE``); +**total perturbation accumulation** (``tA``); +**Combined FDR** where FDR signifies "false discovery rate" (``pGFdr``); +**Status** of the pathway, inhibited vs. activated. +The following columns (available from spia output), are hidden in this table in favour of the combined FDR as an overall assessment of the reliability of a pathway's perturbation. +You can access them per pathway by clicking on the leading ``+`` symbol of a row: +**p-value for at least NDE genes** where NDE signifies "n differentially expressed" (``pNDE``); +**p-value to observe a total accumulation** (``pPERT``); +**Combined p-value** (``pG``); +**Combined Bonferroni p-values** (``pGFWER``); +**pathway id** provided by the pathway database used. \ No newline at end of file diff --git a/workflow/resources/datavzrd/spia-template.yaml b/workflow/resources/datavzrd/spia-template.yaml index fdac5d29..2bf55e60 100644 --- a/workflow/resources/datavzrd/spia-template.yaml +++ b/workflow/resources/datavzrd/spia-template.yaml @@ -1,181 +1,68 @@ -name: ?f"Pathway impact analysis for model {wildcards.model}" +name: ?f"spia pathway impact analysis for model {wildcards.model}" datasets: spia_table: path: ?input.spia_table offer-excel: true separator: "\t" - spia_table_activated: - path: ?input.spia_table_activated - offer-excel: true - separator: "\t" - spia_table_inhibited: - path: ?input.spia_table_inhibited - offer-excel: true - separator: "\t" default-view: spia_table views: spia_table: dataset: spia_table desc: | - The table contains the following columns pSize is the number of genes on the pathway; NDE is the number of DE genes per pathway; tA is the observed total perturbation accumulation in the pathway; pNDE is the probability to observe at least NDE genes on the pathway using a hypergeometric model; pPERT is the probability to observe a total accumulation more extreme than tA only by chance; pG is the p-value obtained by combining pNDE and pPERT; pGFdr and pGFWER are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the Status gives the direction in which the pathway is perturbed (activated or inhibited). + ?f"spia pathway impact analysis for model {wildcards.model}" page-size: 25 render-table: columns: Name: display-mode: normal link-to-url: - reactome: - url: "http://reactome.org/PathwayBrowser/#/{Ids}" + pathway: + ?if params.pathway_db == "reactome": + url: "http://reactome.org/PathwayBrowser/#/{pathway id}" + ?elif params.pathway_db == "panther": + url: "https://www.pantherdb.org/pathway/pathwayDiagram.jsp?catAccession={pathway id}" + # we should add all the pathway databases that bioconductor-graphite enables (see its `pathwayDatabases()` function) + ?else: # not sure what a good fallback would be here + url: "http://reactome.org/PathwayBrowser/#/{pathway id}" number of genes on the pathway: plot: heatmap: scale: linear range: - - white - - "#186904" + - "#F7F7F7" + - "#B2182B" number of DE genes per pathway: plot: heatmap: scale: linear range: - - white - - "#186904" + - "#F7F7F7" + - "#B2182B" p-value for at least NDE genes: - display-mode: hidden + display-mode: detail total perturbation accumulation: plot: heatmap: scale: linear range: - - "#e6550d" - - "white" - - "#6baed6" - domain: - - -1 - - 0 - - 1 + - "#B2182B" + - "#F7F7F7" + - "#2166AC" + domain-mid: 0 p-value to observe a total accumulation: - display-mode: hidden + display-mode: detail Combined p-value: - display-mode: hidden + display-mode: detail Combined FDR: plot: bars: scale: linear Combined Bonferroni p-values: - display-mode: hidden + display-mode: detail Status: plot: heatmap: scale: ordinal color-scheme: accent - Ids: - display-mode: hidden - spia_table_activated: - dataset: spia_table_activated - desc: | - The table (sorted by "Status:Activated") contains the following columns pSize is the number of genes on the pathway; NDE is the number of DE genes per pathway; tA is the observed total perturbation accumulation in the pathway; pNDE is the probability to observe at least NDE genes on the pathway using a hypergeometric model; pPERT is the probability to observe a total accumulation more extreme than tA only by chance; pG is the p-value obtained by combining pNDE and pPERT; pGFdr and pGFWER are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the Status gives the direction in which the pathway is perturbed (activated). - page-size: 25 - render-table: - columns: - Name: - display-mode: normal - link-to-url: - reactome: - url: "http://reactome.org/PathwayBrowser/#/{Ids}" - number of genes on the pathway: - plot: - heatmap: - scale: linear - range: - - white - - "#186904" - number of DE genes per pathway: - plot: - heatmap: - scale: linear - range: - - white - - "#186904" - p-value for at least NDE genes: - display-mode: hidden - total perturbation accumulation: - plot: - heatmap: - scale: linear - range: - - "#e6550d" - - "white" - - "#6baed6" - domain: - - -1 - - 0 - - 1 - p-value to observe a total accumulation: - display-mode: hidden - Combined p-value: - display-mode: hidden - Combined FDR: - plot: - bars: - scale: linear - Combined Bonferroni p-values: - display-mode: hidden - Status: - display-mode: normal - Ids: - display-mode: hidden - spia_table_inhibited: - dataset: spia_table_inhibited - desc: | - The table (sorted by "Status:Inhibited") contains the following columns pSize is the number of genes on the pathway; NDE is the number of DE genes per pathway; tA is the observed total perturbation accumulation in the pathway; pNDE is the probability to observe at least NDE genes on the pathway using a hypergeometric model; pPERT is the probability to observe a total accumulation more extreme than tA only by chance; pG is the p-value obtained by combining pNDE and pPERT; pGFdr and pGFWER are the False Discovery Rate and respectively Bonferroni adjusted global p-values; and the Status gives the direction in which the pathway is perturbed (inhibited). - page-size: 25 - render-table: - columns: - Name: - display-mode: normal - link-to-url: - reactome: - url: "http://reactome.org/PathwayBrowser/#/{Ids}" - number of genes on the pathway: - plot: - heatmap: - scale: linear - range: - - white - - "#186904" - number of DE genes per pathway: - plot: - heatmap: - scale: linear - range: - - white - - "#186904" - p-value for at least NDE genes: - display-mode: hidden - total perturbation accumulation: - plot: - heatmap: - scale: linear - range: - - "#e6550d" - - "white" - - "#6baed6" - domain: - - -1 - - 0 - - 1 - p-value to observe a total accumulation: - display-mode: hidden - Combined p-value: - display-mode: hidden - Combined FDR: - plot: - bars: - scale: linear - Combined Bonferroni p-values: - display-mode: hidden - Status: - display-mode: normal - Ids: - display-mode: hidden \ No newline at end of file + pathway id: + display-mode: detail \ No newline at end of file diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index 253bbc52..f41debf8 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -2,12 +2,12 @@ rule render_datavzrd_config_spia: input: template=workflow.source_path("../resources/datavzrd/spia-template.yaml"), spia_table="results/tables/pathways/{model}.pathways.tsv", - spia_table_activated="results/tables/pathways/{model}.activated-pathways.tsv", - spia_table_inhibited="results/tables/pathways/{model}.inhibited-pathways.tsv", output: "results/datavzrd/spia/{model}.yaml", log: "logs/yte/render-datavzrd-config-spia/{model}.log", + params: + pathway_db=config["enrichment"]["spia"]["pathway_database"], template_engine: "yte" @@ -53,8 +53,6 @@ rule spia_datavzrd: config="results/datavzrd/spia/{model}.yaml", # files required for rendering the given configs spia_table="results/tables/pathways/{model}.pathways.tsv", - spia_table_activated="results/tables/pathways/{model}.activated-pathways.tsv", - spia_table_inhibited="results/tables/pathways/{model}.inhibited-pathways.tsv", output: report( directory("results/datavzrd-reports/spia-{model}"), @@ -67,8 +65,7 @@ rule spia_datavzrd: log: "logs/datavzrd-report/spia-{model}/spia-{model}.log", wrapper: - # "v2.6.0/utils/datavzrd" - "v3.3.5-1-gd73914d/utils/datavzrd" + "v3.3.6/utils/datavzrd" rule diffexp_datavzrd: @@ -93,8 +90,7 @@ rule diffexp_datavzrd: log: "logs/datavzrd-report/diffexp.{model}/diffexp.{model}.log", wrapper: - # "v2.6.0/utils/datavzrd" - "v3.3.5-1-gd73914d/utils/datavzrd" + "v3.3.6/utils/datavzrd" rule go_enrichment_datavzrd: @@ -121,5 +117,4 @@ rule go_enrichment_datavzrd: log: "logs/datavzrd-report/go_enrichment-{model}/go_enrichment-{model}_{gene_fdr}.go_term_fdr_{go_term_fdr}.log", wrapper: - # "v2.6.0/utils/datavzrd" - "v3.3.5-1-gd73914d/utils/datavzrd" + "v3.3.6/utils/datavzrd" diff --git a/workflow/rules/enrichment.smk b/workflow/rules/enrichment.smk index 86890731..cb285c73 100644 --- a/workflow/rules/enrichment.smk +++ b/workflow/rules/enrichment.smk @@ -12,8 +12,6 @@ rule spia: spia_db="resources/spia-db.rds", output: table="results/tables/pathways/{model}.pathways.tsv", - table_activated="results/tables/pathways/{model}.activated-pathways.tsv", - table_inhibited="results/tables/pathways/{model}.inhibited-pathways.tsv", plots="results/plots/pathways/{model}.spia-perturbation-plots.pdf", params: bioc_species_pkg=bioc_species_pkg, diff --git a/workflow/scripts/spia.R b/workflow/scripts/spia.R index dbd41a5f..9289d23f 100644 --- a/workflow/scripts/spia.R +++ b/workflow/scripts/spia.R @@ -6,9 +6,8 @@ library("SPIA") library("graphite") library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE) -# provides library("tidyverse") and functions load_bioconductor_package() and -# get_prefix_col(), the latter requires snakemake@output[["samples"]] and -# snakemake@params[["covariate"]] +# provides library("tidyverse") and get_prefix_col(), where the latter requires +# snakemake@input[["samples"]] and snakemake@params[["covariate"]] source(snakemake@params[["common_src"]]) pw_db <- snakemake@params[["pathway_db"]] @@ -16,91 +15,102 @@ db <- readRDS(snakemake@input[["spia_db"]]) options(Ncpus = snakemake@threads) -diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>% - drop_na(ens_gene) %>% - mutate(ens_gene = str_c("ENSEMBL:", ens_gene)) -universe <- diffexp %>% pull(var = ens_gene) -sig_genes <- diffexp %>% filter(qval <= 0.05) +diffexp <- read_tsv(snakemake@input[["diffexp"]]) |> + drop_na(ens_gene) |> + mutate(ens_gene = str_c("ENSEMBL:", ens_gene)) + +universe <- diffexp |> + dplyr::select(ens_gene) |> + distinct() |> + pull(ens_gene) + +sig_genes <- diffexp |> + filter(qval <= 0.05) + +columns <- c( + "Name", + "number of genes on the pathway", + "number of DE genes per pathway", + "p-value for at least NDE genes", + "total perturbation accumulation", + "p-value to observe a total accumulation", + "Combined p-value", + "Combined FDR", + "Combined Bonferroni p-values", + "Status", + "pathway id" +) + if (nrow(sig_genes) == 0) { - cols <- c( - "Name", "Status", "Combined FDR", - "total perturbation accumulation", "number of genes on the pathway", - "number of DE genes per pathway", "p-value for at least NDE genes", - "Combined Bonferroni p-values", - "p-value to observe a total accumulation", "Combined p-value", "Ids" - ) - res <- data.frame(matrix(ncol = 11, nrow = 0, dimnames = list(NULL, cols))) - # create empty perturbation plots - pdf(file = snakemake@output[["plots"]]) - write_tsv(res, snakemake@output[["table"]]) - write_tsv(res, snakemake@output[["table_activated"]]) - write_tsv(res, snakemake@output[["table_inhibited"]]) - dev.off() + # the best hack for an empty tibble from a column specification I could find + res <- read_csv("\n", col_names = columns) + write_tsv(res, snakemake@output[["table"]]) + write_tsv(res, snakemake@output[["table_activated"]]) + write_tsv(res, snakemake@output[["table_inhibited"]]) } else { - # get logFC equivalent (the sum of beta scores of covariates of interest) + # get logFC equivalent (the sum of beta scores of covariates of interest) - beta_col <- get_prefix_col("b", colnames(sig_genes)) + beta_col <- get_prefix_col("b", colnames(sig_genes)) - beta <- sig_genes %>% - dplyr::select(ens_gene, !!beta_col) %>% - deframe() + beta <- sig_genes |> + dplyr::select(ens_gene, !!beta_col) |> + deframe() - t <- tempdir(check = TRUE) - olddir <- getwd() - setwd(t) - prepareSPIA(db, pw_db) - res <- runSPIA( - de = beta, all = universe, pw_db, - plots = TRUE, verbose = TRUE - ) - setwd(olddir) + t <- tempdir(check = TRUE) + olddir <- getwd() + setwd(t) + prepareSPIA(db, pw_db) + res <- runSPIA( + de = beta, + all = universe, + pw_db, + plots = TRUE, + verbose = TRUE + ) + setwd(olddir) - file.copy( - file.path(t, "SPIAPerturbationPlots.pdf"), - snakemake@output[["plots"]] - ) - pathway_names <- db[res$Name] - pathway_names <- db[res$Name] - path_ids <- as.matrix(lapply(pathway_names@entries, slot, "id")) - if (length(path_ids) > 0) { - path_ids_data_frame <- - data.frame(Ids = matrix(unlist(path_ids), - nrow = length(path_ids), byrow = TRUE - )) - final_res <- cbind(res, - Ids = path_ids_data_frame$Ids - ) - res_reorder <- dplyr::select( - final_res, Name, Status, - pGFdr, tA, pSize, NDE, pNDE, pGFWER, pPERT, pG, Ids - ) - res_reorder <- res_reorder %>% - rename( - "Combined Bonferroni p-values" = "pGFWER", - "Combined FDR" = "pGFdr", - "total perturbation accumulation" = "tA", - "number of genes on the pathway" = "pSize", - "number of DE genes per pathway" = "NDE", - "Combined p-value" = "pG", - "p-value to observe a total accumulation" = "pPERT", - "p-value for at least NDE genes" = "pNDE" - ) - write_tsv(res_reorder, snakemake@output[["table"]]) - sort_activated <- res_reorder[res_reorder$Status == "Activated", ] - sort_inhibited <- res_reorder[res_reorder$Status == "Inhibited", ] - write_tsv(sort_activated, snakemake@output[["table_activated"]]) - write_tsv(sort_inhibited, snakemake@output[["table_inhibited"]]) - } else { - columns <- c( - "Name", "Status", "Combined FDR", "total perturbation accumulation", - "number of genes on the pathway", "number of DE genes per pathway", - "p-value for at least NDE genes", "Combined Bonferroni p-values", - "p-value to observe a total accumulation", "Combined p-value", "Ids" + file.copy( + file.path(t, "SPIAPerturbationPlots.pdf"), + snakemake@output[["plots"]] + ) + pathway_names <- db[res$Name] + if (length(pathway_names) > 0) { + pathway_ids_tibble <- pathway_names@entries |> + map(slot, "id") |> + unlist() |> + as_tibble( + rownames="pathway_name" + ) |> + rename( + `pathway id` = value + ) + final_res <- as_tibble(res) |> + left_join( + pathway_ids_tibble, + join_by(Name == pathway_name) + ) |> + rename( + "number of genes on the pathway" = "pSize", + "number of DE genes per pathway" = "NDE", + "p-value for at least NDE genes" = "pNDE", + "total perturbation accumulation" = "tA", + "p-value to observe a total accumulation" = "pPERT", + "Combined p-value" = "pG", + "Combined FDR" = "pGFdr", + "Combined Bonferroni p-values" = "pGFWER" + ) |> + dplyr::select( + all_of( + columns ) - emtpy_data_frame <- data.frame(matrix(nrow = 0, ncol = length(columns))) - colnames(emtpy_data_frame) <- columns - write_tsv(emtpy_data_frame, snakemake@output[["table"]]) - write_tsv(emtpy_data_frame, snakemake@output[["table_activated"]]) - write_tsv(emtpy_data_frame, snakemake@output[["table_inhibited"]]) - } -} + ) |> + arrange( + desc(`total perturbation accumulation`) + ) + write_tsv(final_res, snakemake@output[["table"]]) + } else { + # the best hack for an empty tibble from a column specification I could find + emtpy_data_frame <- read_csv("\n", col_names = columns) + write_tsv(emtpy_data_frame, snakemake@output[["table"]]) + } +} \ No newline at end of file