assets/differentialabundance_report.Rmd

---
output:
    html_document:
        toc: true                               # table of contents
        toc_float: true                         # float the table of contents to the left of the main document content
        toc_depth: 4                            # header levels 1,2,3
        theme: default
        number_sections: false                  # add section numbering to headers
        df_print: paged                         # tables are printed as an html table with support for pagination over rows and columns
        highlight: pygments
        pdf_document: true
    pdf_document:
        toc: yes
date: "`r Sys.Date()`"
params:
    meta: NULL
    input_dir: NULL
    artifact_dir: NULL
    cpus: 1
    study_type: NULL
    study_name: NULL
    study_abundance_type: NULL
    report_file: NULL,
    report_title: NULL,
    report_contributors: NULL
    report_author: NULL,
    report_description: NULL,
    report_scree: NULL
    report_round_digits: NULL
    observations_type: NULL
    observations: NULL                                          # GSE156533.samplesheet.csv
    observations_id_col: NULL
    observations_name_col: NULL
    features: NULL
    features_type: NULL
    features_id_col: NULL
    features_name_col: NULL
    features_metadata_cols: NULL
    features_gtf_feature_type: NULL
    features_gtf_table_first_field: NULL
    exploratory_log2_assays: NULL
    raw_matrix: null                                            # e.g. 0_salmon.merged.gene_counts_length_scaled.tsv
    normalised_matrix: null
    variance_stabilised_matrix: null                            # e.g. test_files/3_treatment-WT-P23H.vst.tsv
    contrasts_file: null                                        # e.g. GSE156533.contrasts.csv
    differential_table: file.csv
    proteus_measurecol_prefix: NULL
    proteus_norm_function: NULL
    proteus_plotsd_method: NULL
    proteus_plotmv_loess: NULL
    proteus_palette_name: NULL
    affy_cel_files_archive: NULL
    affy_file_name_col: NULL
    affy_background: NULL
    affy_bgversion: NULL
    affy_destructive: NULL
    affy_cdfname: NULL
    affy_rm_mask: NULL
    affy_rm_outliers: NULL
    affy_rm_extra: NULL
    affy_build_annotation: NULL
    limma_ndups: NULL
    limma_spacing: NULL
    limma_block: NULL
    limma_correlation: NULL
    limma_method: NULL
    limma_proportion: NULL
    limma_stdev_coef_lim: NULL
    limma_trend: NULL
    limma_robust: NULL
    limma_winsor_tail_p: NULL
    limma_adjust_method: NULL
    limma_p_value: NULL
    limma_lfc: NULL
    limma_confint: NULL
    exploratory_n_features: null
    exploratory_clustering_method: null
    exploratory_cor_method: null
    exploratory_whisker_distance: null
    exploratory_mad_threshold: null
    exploratory_main_variable: null
    exploratory_assay_names: NULL
    exploratory_final_assay: NULL
    exploratory_palette_name: NULL
    versions_file: null                                         # e.g 17_software_versions.yml
    logo: null
    css: null
    citations: null
    filtering_min_samples: 1
    filtering_min_abundance: 1
    filtering_min_proportion: NULL
    filtering_grouping_var: NULL
    differential_file_suffix: NULL
    differential_feature_id_column: NULL
    differential_feature_name_column: NULL
    differential_fc_column: NULL
    differential_pval_column: NULL
    differential_qval_column: NULL
    differential_min_fold_change: NULL
    differential_foldchanges_logged: NULL
    differential_max_pval: NULL
    differential_max_qval: NULL
    differential_palette_name: NULL
    differential_subset_to_contrast_samples: NULL
    deseq2_test: NULL
    deseq2_fit_type: NULL
    deseq2_sf_type: NULL
    deseq2_min_replicates_for_replace: NULL
    deseq2_use_t: NULL
    deseq2_lfc_threshold: NULL
    deseq2_alt_hypothesis: NULL
    deseq2_independent_filtering: NULL
    deseq2_p_adjust_method: NULL
    deseq2_alpha: NULL
    deseq2_minmu: NULL
    deseq2_vs_method: NULL
    deseq2_shrink_lfc: NULL
    deseq2_cores: NULL
    deseq2_vs_blind: NULL
    deseq2_vst_nsub: NULL
    gsea_run: false
    gsea_nperm: NULL
    gsea_permute: NULL
    gsea_scoring_scheme: NULL
    gsea_metric: NULL
    gsea_sort: NULL
    gsea_order: NULL
    gsea_set_max: NULL
    gsea_set_min: NULL
    gsea_norm: NULL
    gsea_rnd_type: NULL
    gsea_make_sets: NULL
    gsea_median: NULL
    gsea_num: NULL
    gsea_plot_top_x: NULL
    gsea_rnd_seed: NULL
    gsea_save_rnd_lists: NULL
    gsea_zip_report: NULL
    gsea_chip_file: NULL
    gprofiler2_run: false
    gprofiler2_organism: NULL
    gprofiler2_significant: NULL
    gprofiler2_measure_underrepresentation: NULL
    gprofiler2_correction_method: NULL
    gprofiler2_sources: NULL
    gprofiler2_evcodes: NULL
    gprofiler2_max_qval: NULL
    gprofiler2_token: NULL
    gprofiler2_background_file: NULL
    gprofiler2_background_column: NULL
    gprofiler2_domain_scope: NULL
    gprofiler2_min_diff: NULL
    gprofiler2_palette_name: NULL
    gene_sets_files: NULL
---

<!-- Load libraries -->

```{r, include=FALSE}
library(knitr)
library(yaml)
library(shinyngs)
library(plotly)
library(DT)
```

<!-- Define some functions -->

```{r, include=FALSE}
round_dataframe_columns <- function(df, columns = NULL, digits = -1) {
    if (digits == -1) {
        return(df)                              # if -1, return df without rounding
    }

    df <- data.frame(df, check.names = FALSE)   # make data.frame from vector as otherwise, the format will get messed up
    if (is.null(columns)) {
        columns <- colnames(df)[(unlist(lapply(df, is.numeric), use.names=F))]    # extract only numeric columns for rounding
    }
    df[,columns] <- format(data.frame(df[, columns], check.names = FALSE), scientific=T, digits=params$report_round_digits)
    # Convert columns back to numeric

    for (c in columns) {
        df[[c]][grep("^ *NA$", df[[c]])] <- NA
        df[[c]] <- as.numeric(df[[c]])
    }
    df
}
```

```{r include = FALSE}
# Load the datatables js
datatable(NULL)
```

```{r, include=FALSE}
versions <- unlist(yaml.load_file(file.path(params$input_dir, params$versions_file)), recursive = FALSE)
params_table <- data.frame(Parameter = names(unlist(params)), Value = unlist(params), row.names = NULL)

# We'll subset the params table for different report sections
make_params_table <- function(name, pattern = NULL, remove_pattern = FALSE){
    subparams <- params_table
    if (! is.null(pattern)){
        subparams <- subparams[grep(pattern, subparams$Parameter),]
    }
    if (remove_pattern){
        subparams$Parameter <- sub(pattern, '', subparams$Parameter)
    }

    if (nrow(subparams) > 10){
        dom <- 'tp'
    }else{
        dom <- 't'
    }

    print( htmltools::tagList(datatable(subparams, caption = paste("Parameters used for", name), rownames = FALSE, options = list(dom = dom)) ))
}

report_title <- paste0('Differential ',  params$features_type, ' abundance report', ifelse(is.null(params$report_title), '', paste0(': ', params$report_title)))
report_subtitle <- paste0(ifelse(is.null(params$report_author), '', paste0('By ', params$report_author, ', ')), '<br>differentialabundance workflow version', versions[["Workflow.nf-core/differentialabundance"]])
```

---
title:  "<img src=\"`r file.path(params$input_dir, params$logo)`\" style=\"float: left;\"/>`r report_title`"
subtitle: `r report_subtitle`
---
\

<!-- set notebook defaults -->

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

<!-- Include the CSS and set the logo -->

```{r, echo=FALSE}
htmltools::includeCSS(params$css)
```

```{r results="asis", echo=FALSE}
cat(paste0("
<style>
#TOC {
    background-image: url(\"", knitr::image_uri(params$logo), "\");
}
</style>
"))
```

<!-- Include PI/contact info if provided -->

```{r, results='asis', echo=F, eval=!is.null(params$report_contributors)}
contributors <- gsub("\n", "<br>", params$report_contributors, fixed = TRUE)
contributors <- lapply(simpleSplit(contributors, ";"), function(s) {
    splt <- simpleSplit(s, "<br>")
    paste0("**", head(splt, 1), "**<br>", paste(tail(splt, -1), collapse = "<br>"))
})

for (r in seq_along(contributors)) {
    if (r %% 2 == 1) cat("<div class='div-row'>")
    cat(paste0("<div class='div-column'>", contributors[r], "</div>"))
    if (r %% 2 == 0 || r == length(contributors)) cat("</div>")
}
```

<!-- Load input data -->

```{r, echo=FALSE}
observations <- read_metadata(file.path(params$input_dir, params$observations), id_col = params$observations_id_col)
observations_name_col <- ifelse(!is.null(params$observations_name_col), params$observations_name_col, params$observations_id_col)
if (! observations_name_col %in% colnames(observations)){
    stop(paste('Invalid observation name column specified: ', observations_name_col, paste0('(Valid values are: ', paste(colnames(observations), collapse=', '),')')))
}

if (! is.null(params$features)){
    features <- read_metadata(file.path(params$input_dir, params$features))
    if (! is.null(params$features_metadata_cols)){
        features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE]
    }
}

contrasts <- read_metadata(file.path(params$input_dir, params$contrasts_file))
contrasts$blocking <- na.replace(contrasts$blocking, '')
if (! 'id' %in% colnames(contrasts)){
    contrasts$id <- apply(contrasts, 1, paste, collapse='_')
}

# Identify informative variables- those with a number of values greater than 1
# but less than N, with N being the number of observations. Make sure contrast
# variables are first in the list

informative_variables <- unique(c(contrasts$variable, chooseGroupingVariables(observations)))

# Remove any informative variables that group observations the same way
informative_variables <- informative_variables[ ! duplicated(lapply(structure(informative_variables, names= informative_variables), function(x) as.numeric(factor(observations[[x]], levels=unique(observations[[x]])))))]

assay_names <- simpleSplit(params$exploratory_assay_names)
names(assay_names) = assay_names
assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]])

assay_data <- lapply(assay_files, function(x) {
    mat <- na.omit(
        read_matrix(
        x,
        sample_metadata = observations,
        row.names = 1
        )
    )
    colnames(mat) <- observations[[observations_name_col]][match(colnames(mat), rownames(observations))]
    mat
})

log2_assays <- params$exploratory_log2_assays
if (!is.null(log2_assays)) {
    # Remove brackets from assay list. TODO: remove if this is added to cond_log2_transform_assays
    log2_assays <- gsub('\\]$', '', gsub('^\\[', '', log2_assays))
}
assay_data <- cond_log2_transform_assays(assay_data, log2_assays, prettify_names = FALSE)

# Now we can rename the observations rows using the title field
rownames(observations) <- observations[[observations_name_col]]

# Run PCA early so we can understand how important each variable is
pca_datas <- lapply(names(assay_data), function(assay_type){
    compilePCAData(assay_data[[assay_type]])
})
names(pca_datas) <- names(assay_data)

pca_vs_meta <- anova_pca_metadata(pca_datas[[params$exploratory_final_assay]]$coords, observations[,informative_variables, drop = FALSE], pca_datas[[params$exploratory_final_assay]]$percentVar)

# Show the variable with the tightest PC associations first
informative_variables <- rownames(pca_vs_meta)[order(pca_vs_meta[,1])]

# Pick the variable used for coloring purposes etc
if (params$exploratory_main_variable == 'contrasts'){
    main_grouping_variable <- contrasts$variable[1]
}else if (params$exploratory_main_variable == 'auto_pca'){
    main_grouping_variable <- informative_variables[1]
}else{
    if (! params$exploratory_main_variable %in% colnames(observations)){
        stop(paste('Invalid main variable specified: ', params$exploratory_main_variable))
    }
    main_grouping_variable <- params$exploratory_main_variable
}

# Make sure the main variable is shown first, with remaining shown in order of
# informativeness

informative_variables <- unique(c(main_grouping_variable, informative_variables))

groupColorScale <- makeColorScale(length(unique(observations[[main_grouping_variable]])), palette = params$exploratory_palette_name)
```

<!-- Read the differential results.
NOTE: differential results files are expected to have the pattern:

<variable>-<reference>-<target>-<blocking><differential_file_suffix>, e.g.
treatment-mCherry-hND6-batcheffect.deseq2.results.tsv

... where variable, reference, target and blocking come from the contrasts file
(with blocking being optional) and the suffix is defined in parameters.
-->

```{r, echo=FALSE}
differential_file_suffix <- params$differential_file_suffix
if (is.null(differential_file_suffix)) {
    differential_file_suffix <- ifelse(params$study_type %in% c('rnaseq'), ".deseq2.results.tsv", ".limma.results.tsv")
}
differential_files <- lapply(contrasts$id, function(d){
    file.path(params$input_dir, paste0(gsub(' |;', '_', d), differential_file_suffix))
})

# Initialize vector to store warning messages before merging tables
warnings_list <- c()

# Read differential results and merge with features table
results <- lapply(differential_files, function(diff_file) {
    if (!file.exists(diff_file)) stop(paste("Differential file", diff_file, "does not exist"))

    diff <- read_differential(
        diff_file,
        feature_id_column = params$differential_feature_id_column,
        fc_column = params$differential_fc_column,
        pval_column = params$differential_pval_column,
        qval_column = params$differential_qval_column
    )

    # Log transform fold changes if not already logged
    if (!params$differential_foldchanges_logged) {
        diff[[params$differential_fc_column]] <- log2(diff[[params$differential_fc_column]])
    }

    # Annotate differential table if features table is provided
    if (!is.null(params$features)) {
        ## Merge Differential expression table on features table
        merged <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column)

        ## Get number of missing rows
        n_missing <- length(setdiff(diff[[params$differential_feature_id_column]], merged[[params$features_id_col]]))

        ## Create warnings if necessary
        warnings <- c(
            ## Missing IDs
            if (n_missing > 0) sprintf(
                '<p style="color:#DAA520;"><strong>WARNING:</strong> %d IDs from the differential table (%s) were lost on merge with features table (%s).</p>',
                n_missing, basename(diff_file), basename(params$features)
            ),
            ## Check whether there are fewer rows, missing data
            if (nrow(merged) < nrow(diff)) sprintf(
                '<p style="color:#DAA520;"><strong>WARNING:</strong> Rows were lost on merge (%s -> %s). Original: %d, Merged: %d.</p>',
                basename(diff_file), basename(params$features), nrow(diff), nrow(merged)
            ),
            ## Check whether there are more rows, possible duplications
            if (nrow(merged) > nrow(diff)) sprintf(
                '<p style="color:#DAA520;"><strong>WARNING:</strong> Rows were duplicated on merge (%s -> %s). Original: %d, Merged: %d.</p>',
                basename(diff_file), basename(params$features), nrow(diff), nrow(merged)
            )
        )
    } else {
        merged <- diff
        warnings <- character(0)
    }
    ## Collect results
    list(diff_features = merged, warnings = warnings)
})

# Separate differential_results and warnings_list from results
differential_results <- lapply(results, `[[`, "diff_features")
warnings_list <- unlist(lapply(results, `[[`, "warnings"))

names(differential_results) <- contrasts$id
```

<!-- Calculate some summary statistics -->

```{r, echo=FALSE}

# Function to make friendly contrast name from contrast components, including optional bits

name_contrast <- function(i){
    contrast_name <- paste(contrasts$target[i], 'versus', contrasts$reference[i], 'in', contrasts$variable[i])
    contrast_vals <- contrasts[i,]
    populated <- colnames(contrasts)[! (is.na(contrast_vals) | contrast_vals == '' | is.null(contrast_vals))]
    optional <- setdiff(populated, c('id', 'target', 'reference', 'variable'))

    if (length(optional) > 0){
        optional_part <-  paste0('(', paste(paste(optional, contrasts[i,optional], sep=': '), collapse=', '), ')')
    }else{
        optional_part <- ''
    }

    paste(contrast_name, optional_part)
}

contrast_descriptions <- unlist(lapply(1:nrow(contrasts), function(x) name_contrast(x)))

# Check both adjusted and unadjusted p values

p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column)
p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval)

sig_differential <-
    lapply(names(p_value_types), function(pvt){
        diff <- lapply(
        1:nrow(contrasts),
        function(x){
            signif <- differential_results[[x]][,p_value_types[[pvt]] ] < p_value_thresholds[[pvt]]
            list(
            up = differential_results[[x]][which(
                differential_results[[x]][,params$differential_fc_column ] > log2(params$differential_min_fold_change) &
                signif
            ),],
            down = differential_results[[x]][which(
                differential_results[[x]][,params$differential_fc_column ] < log2(1/params$differential_min_fold_change) &
                signif
            ),]
            )
        }
        )
        names(diff) <- contrast_descriptions
        diff
    })
names(sig_differential) <- names(p_value_types)

# Count the differential genes
differential_tables <- lapply(names(sig_differential), function(sd) do.call(rbind, lapply(sig_differential[[sd]], function(x) lapply(x, function(y) nrow(y)))))
names(differential_tables) <- names(sig_differential)
```

<!-- Write the report -->

# Abstract

This report summarises differential `r params$features_type` analysis as performed by the nf-core/differentialabundance pipeline.

# Data

```{r, echo=FALSE, results='asis'}
cat(paste0("\n## ", ucfirst(params$observations_type), "s\n"))
```


A summary of `r params$observations_type` metadata is below:

```{r, echo=FALSE, results='asis'}
display_columns <- union(c(params$observations_id_col, unique(contrasts$variable)), informative_variables)
minimal_fetchngs_cols <- c('sample', 'sample_title', 'strandedness', 'library_strategy', 'scientific_name')

# If the data came via fetchngs then we can infer a couple of things about the most useful columns

if (all(minimal_fetchngs_cols %in% colnames(observations))){
    additional_useful_cols <- minimal_fetchngs_cols
}else{
    additional_useful_cols <- colnames(observations)[which(apply(observations, 2, function(x) max(nchar(x))) <= 20)]
}

display_columns <- head(union(display_columns, additional_useful_cols), 5)

# Also add informative columns
display_columns <- unique(c(display_columns, informative_variables))
observations_to_print <- observations[,unique(display_columns)]
colnames(observations_to_print) <- prettifyVariablename(colnames(observations_to_print))
print( htmltools::tagList(datatable(observations_to_print, caption = paste(ucfirst(params$observations_type), 'metadata'), rownames = FALSE, options = list(dom = 'tp')) ))
```

## Contrasts

Comparisons were made between `r params$observations_type` groups defined using `r params$observation_type` metadata columns, as described in the following table of contrasts:

```{r, echo=FALSE, results='asis'}
contrasts_to_print <- contrasts
colnames(contrasts_to_print) <- prettifyVariablename(colnames(contrasts_to_print))

# Add design/model formulae to report
de_tool <- ifelse(params$study_type %in% c('rnaseq'), "deseq2", "limma")
contrasts_to_print$model <- sapply(contrasts_to_print$Id, function(id) {
    model_file <- paste0(id, ".", de_tool, ".model.txt")
    if (file.exists(model_file)) {
        first_line <- readLines(model_file, n = 1)
        return(first_line)
    } else {
        return(NA)
    }
})

print( htmltools::tagList(datatable(contrasts_to_print, caption = paste0("Table of contrasts"), rownames = FALSE, options = list(dom = ifelse(nrow(contrasts_to_print) > 10, 'tp', 't'))) ))
```

# Results

## Counts

Input was a matrix of `r nrow(assay_data$raw)` `r params$features_type`s for `r ncol(assay_data$raw)` `r params$observations_type`s`r ifelse(nrow(assay_data$normalised) < nrow(assay_data$raw), paste0(', reduced to ', nrow(assay_data$normalised), ' ', params$features_type, 's after filtering for low abundance'), '')`.

## Exploratory analysis

### Abundance value distributions

The following plots show the abundance value distributions of input matrices. A log2 transformation is applied where not already performed.

#### Box plots

```{r, echo=FALSE, results='asis', fig.height=8}

p <- ggplot_boxplot(
    assay_data,
    experiment = observations,
    colorby = main_grouping_variable,
    expressiontype = paste("count per", params$features_type),
    palette = groupColorScale,
    whisker_distance = params$exploratory_whisker_distance,
    base_size=8
)
print(p)
```

Whiskers in the above boxplots show `r params$exploratory_whisker_distance` times the inter-quartile range.

#### Density plots

```{r, echo=FALSE, results='asis', fig.height=8}
plotly_densityplot(
    assay_data,
    experiment = observations,
    colorby = observations_name_col,
    expressiontype = paste("count per", params$features_type),
    makeColorScale(length(unique(observations[[params$observations_id_col]])), palette = "Set1")
)
```

```{r, echo=FALSE, results='asis'}
cat(paste0("\n### ", ucfirst(params$observations_type), " relationships\n"))
```

#### Principal components plots {.tabset}

Principal components analysis was conducted based on the `r params$exploratory_n_features` most variable `r params$features_type`s. Each component was annotated with its percent contribution to variance.

```{r, echo=FALSE, results='asis'}
# Create nested list to save the percentVars for reusing in the scree plot
percentVar_list <- list()
for (assay_type in rev(names(assay_data))){

    pca_data <- pca_datas[[assay_type]]
    for (iv in informative_variables){

        cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n"))

        plotdata <- pca_data$coords
        plotdata$colorby <- factor(
        observations[[iv]],
        levels = unique(observations[[iv]])
        )
        pcaColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name)

        # Make plotting data combining PCA coords with coloring groups etc

        plotdata$name <- rownames(plotdata)
        percentVar <- pca_data$percentVar
        labels <- paste0(colnames(plotdata), " (", sprintf("%.1f", percentVar), "%)")
        ncats <- length(unique(plotdata$colorby))

        plot_types <- list("2" = "scatter", "3" = "scatter3d")

        for (d in names(plot_types)) {

        # Default plot args whatever we're doing

        plot_args <- list(
            x = pca_data$coords[, 1],
            y = pca_data$coords[, 2],
            xlab = labels[1],
            ylab = labels[2],
            colorby = plotdata$colorby,
            plot_type = plot_types[[d]],
            palette = pcaColorScale,
            legend_title = prettifyVariablename(iv),
            labels = plotdata$name,
            show_labels = TRUE
        )

        if (d == "3") {
            plot_args$z <- pca_data$coords[, 3]
            plot_args$zlab <- labels[3]
        }

        print(htmltools::tagList(do.call("plotly_scatterplot", plot_args)))
        }
        if (! assay_type %in% names(percentVar_list)){
        percentVar_list[[assay_type]] <- percentVar
        }
    }
}
```

```{r, echo=FALSE, results='asis', eval=params$report_scree}
cat(paste0("\n#### Scree plot {.tabset}"))
cat(paste0("\nThe following scree plot visualizes what percentage of total variation in the data can be explained by each of the principal components computed.\n"))
#iv <- informative_variables[1]

for (assay_type in names(percentVar_list)) {
    percentVarData <- data.frame(percentVar_list[[assay_type]])
    colnames(percentVarData) <- c("var_explained")
    percentVarData$PCA <- as.numeric(rownames(percentVarData))
    cat(paste0("\n##### ", prettifyVariablename(assay_type), "\n"))
    print(
        ggplot(percentVarData, aes(x=factor(PCA),y=var_explained, group=1)) +
        theme_bw() +
        geom_point(size=4) +
        geom_line(linetype="dashed") +
        xlab("PC") +
        ylab("Percent variance explained")
    )
    cat("\n")
}
```

#### Principal components/ metadata associations

For the variance stabilised matrix, an ANOVA test was used to determine assocations between continuous principal components and categorical covariates (including the variable of interest).

The resulting p values are illustrated below.

```{r, echo=FALSE, results='asis'}

# This is a little hack to work around a bug in d3heatmap with single-row data
# frames.
if (nrow(pca_vs_meta) == 1){
    plot_pca_meta <- rbind(pca_vs_meta, pca_vs_meta)
}else{
    plot_pca_meta <- pca_vs_meta
}

d3heatmap::d3heatmap(
    -log10(plot_pca_meta),
    Rowv = FALSE,
    dendrogram = 'none',
    cellnote = plot_pca_meta,
    cexCol = 0.8,
    cexRow = 0.8,
    height = (100 + (15 * nrow(plot_pca_meta))),
    colors = colorRampPalette(
        rev(
        RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")
        )
    )(100)
)

for (variable in rownames(pca_vs_meta)){
    sig_comps <- pca_vs_meta[variable,] < 0.1

    if (any(sig_comps)){
        min_sig_comp <- min(which(sig_comps))

        min_sig_comp_p <- sprintf("%.2f", pca_vs_meta[variable, min_sig_comp])
        cat(paste0('The variable \'', variable, '\' shows an association with ', colnames(pca_vs_meta)[min_sig_comp], ' (p = ', min_sig_comp_p,'). '))
    }
}
```

#### Clustering dendrograms {.tabset}

A hierarchical clustering of `r params$features_type`s was undertaken based on `r ifelse(params$exploratory_n_features == -1, paste0("all ", params$features_type), paste0("the ", params$exploratory_n_features, " most variable ", params$features_type))`s. Distances between `r params$features_type`s were estimated based on `r params$exploratory_cor_method` correlation, which were then used to produce a clustering via the `r params$exploratory_clustering_method` method with `hclust()` in R.

```{r, echo=FALSE, results='asis'}
for (assay_type in rev(names(assay_data))){
    for (iv in informative_variables){
        cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n"))
        variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = ifelse(params$exploratory_n_features == -1, nrow(assay_data[[assay_type]]), params$exploratory_n_features))

        dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name)
        p <- clusteringDendrogram(
        2^assay_data[[assay_type]][variable_genes, ],
        observations[, iv, drop = FALSE],
        colorby = iv,
        cor_method = params$exploratory_cor_method,
        plot_title = paste0(
            paste0(params$observations_type," clustering dendrogram, "),
            ifelse(params$exploratory_n_features == -1, nrow(assay_data[[assay_type]]), paste0(params$exploratory_n_features, " most variable")), " ",
            params$features_type,
            "s\n(", params$exploratory_clustering_method, " clustering, ", params$exploratory_cor_method, " correlation)"),
        cluster_method = params$exploratory_clustering_method,
        palette = dendroColorScale,
        labelspace = 0.25
        )
        # Defaults in shinyngs make the text in this plot a bit big for the report, so
        # scale it down a bit
        print(p, vp=grid::viewport(gp=grid::gpar(cex=0.7)))
        cat("\n")
    }
}
```

```{r, echo=FALSE, results='asis', warning=FALSE}

# We can't look for ouliers in sets of less than 3 samples, so exclude variables
# unless the minimum group size is larger than that
iv_min_group_sizes <- unlist(lapply(informative_variables, function(x) min(table(observations[[x]]))))

if (any(iv_min_group_sizes > 2)){
    cat("\n### Outlier detection {.tabset}\n")
    cat("\nOutlier detection based on [median absolute deviation](https://archive.ph/o3thZ) was undertaken, the outlier scoring is plotted below. For more on MAD, see [this wiki article](https://en.wikipedia.org/wiki/Median_absolute_deviation).\n")
}

foo <- lapply(informative_variables[iv_min_group_sizes > 2], function(iv){

    cat(paste("\n####", iv, "\n"))

    plotdata <-
    madScore(
        matrix = assay_data[[params$exploratory_final_assay]],
        sample_sheet = observations,
        groupby = iv
    )

    if (! is.null(plotdata)){
        mad_plot_args <- list(
            x = plotdata$group,
            y = plotdata$mad,
            color = plotdata$outlier,
            hline_thresholds = c("Outlier threshold" = params$exploratory_mad_threshold),
            palette = makeColorScale(2, palette = params$differential_palette_name),
            legend_title = "Outlier status",
            labels = rownames(plotdata),
            show_labels = TRUE,
            xlab = "Sample group",
            ylab = "MAD score"
        )

        print(htmltools::tagList(do.call("plotly_scatterplot", mad_plot_args)))

        outliers <- rownames(plotdata)[plotdata$outlier]

        if (length(outliers) == 0){
            cat(paste0("No outlying samples were detected in groups defined by ", iv,".\n"))
        }else{
            cat(paste0(length(outliers), ' possible outliers were detected in groups defined by ', iv ,': ', paste(outliers, collapse=', '), "\n"))
        }
    }
})
```

## Differential analysis

```{r, echo=FALSE, results='asis', eval=params$study_type %in% c('rnaseq')}
# For DESeq2, add some more explanation to the report
cat(paste0(
"The `DESeq2 R` package was used for differential analysis. p-values were adjusted with the ", params$deseq2_p_adjust_method, " method to reduce the number of false positives. ", ucfirst(params$features_type), "s were considered differential if, for the respective contrast, the adjusted p-value was equal to or lower than ", params$deseq2_alpha, " and the absolute log2 fold change was equal to or higher than ", params$deseq2_lfc_threshold, "."
))
```

### Differential `r params$features_type` `r params$study_abundance_type` {.tabset}

```{r, echo=FALSE, results='asis'}
foo <- lapply(names(p_value_types), function(pvt){
    cat("\n#### ", pvt, "\n")
    print( htmltools::tagList(datatable(differential_tables[[pvt]], caption = paste0('Differential ',  params$features_type, " ", params$abundance_type, ' (target relative to reference)'), options = list(dom = 't'), rownames = TRUE) ))
    cat("\n")
})
```

```{r, echo=FALSE, results='asis', eval = FALSE}
differential_summary_string <- paste(
    paste(
    lapply(
        1:nrow(contrasts),
        function(x){
        paste0(
            "Contrast ", x, ' (', contrast_descriptions[x], ') ', "had ", differential_table[x,'up'], ' ', paste0(params$features_type, 's'), ' expressed significantly more highly in ', contrasts[x, 'target',], ' than ', contrasts[x, 'reference',], ' and ', differential_table[x,'down'], ' expressed at sifnificantly lower levels.'
        )
        }
    ),
    collapse = ' '
    )
)
cat(differential_summary_string)
```

### Differential `r params$features_type` details

```{r, echo=FALSE, results='asis', warning=FALSE, message=FALSE}

# Display all warnings related to number of rows
if (length(warnings_list) > 0) {
    for (warning in warnings_list) { cat(warning) }
}

for (i in 1:nrow(contrasts)){
    cat("\n#### ", contrast_descriptions[i], "  {.tabset}\n")

    ## Make a volcano plot for the contrast first

    # Label features with symbol as well as identifier
    if (! is.null(params$features) && (! is.null(params$differential_feature_name_column)) ){
        label_col <- params$differential_feature_name_column
    }else{
        label_col <- params$differential_feature_id_column
    }

    # Get the full set of differential stats for this contrast, removing rows with
    # NAs in the fields we need.
    full_de <- differential_results[[i]]
    full_de <- subset(full_de, (! is.na(full_de[[params$differential_fc_column]])) & (! is.na(full_de[[params$differential_qval_colum]])) )

    # We'll color by whether features are differential according to supplied thresholds

    p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column)
    p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval)

    for (pvt in names(p_value_types)){
        cat("\n##### ", pvt, " p values\n")
        pval_column <- p_value_types[[pvt]]

        de_fc <- abs(full_de[[params$differential_fc_column]]) >= abs(log2(params$differential_min_fold_change))
        de_fc_label <- paste("abs(logFC) >=", log2(params$differential_min_fold_change))

        de_pval <- full_de[[pval_column]] <= p_value_thresholds[[pvt]]
        de_pval_label <- paste(pvt, "<=", p_value_thresholds[[pvt]])

        de_pval_fc_label <- paste(de_fc_label, '&', de_pval_label)

        full_de$differential_status <- "Not significant"
        full_de$differential_status[de_fc] <- de_fc_label
        full_de$differential_status[de_pval] <- de_pval_label
        full_de$differential_status[de_fc & de_pval] <- de_pval_fc_label
        full_de$differential_status <- factor(full_de$differential_status, levels = c("Not significant", de_fc_label, de_pval_label, de_pval_fc_label), ordered = TRUE) # Factorize status so that non-significant is always first
        # Define the thresholds we'll draw

        hline_thresholds = vline_thresholds = list()
        hline_thresholds[[paste(pval_column, '=', p_value_thresholds[[pvt]])]] = -log10(p_value_thresholds[[pvt]])
        vline_thresholds[[paste(params$differential_fc_column, '<=', log2(params$differential_min_fold_change))]] = -log2(params$differential_min_fold_change)
        vline_thresholds[[paste(params$differential_fc_column, '>=', log2(params$differential_min_fold_change))]] = log2(params$differential_min_fold_change)

        palette_volcano <- append(c('#999999'), makeColorScale(3, params$differential_palette_name)) # set non-significant to gray

        plot_args <- list(
        x = full_de[[params$differential_fc_column]],
        y = -log10(full_de[[pval_column]]),
        colorby = full_de$differential_status,
        ylab = paste("-log(10)", pval_column),
        xlab = xlabel <- paste("higher in", contrasts$reference[i], "          <<", params$differential_fc_column, ">>           higher in", contrasts$target[i]),
        labels = full_de[[label_col]],
        hline_thresholds = hline_thresholds,
        vline_thresholds = vline_thresholds,
        show_labels = FALSE,
        legend_title = "Differential status",
        palette = palette_volcano
        )

        # Let's equalize the axes
        max_fc <- max(abs(full_de[[params$differential_fc_column]])) * 1.1

        # Print warning if any p values are 0
        zero_p <- length(which(full_de[[pval_column]]==0))
        if (zero_p) {
            cat(paste0("<i>", zero_p, " feature", ifelse(zero_p>1, "s are", " is"), " not shown because of p value = 0; please refer to the results tables.</i><br><br>"))
        }

        p <- do.call(plotly_scatterplot, plot_args) %>%
        layout(xaxis = list(range=list(-max_fc, max_fc)))

        print(htmltools::tagList(p))

        ## ... then show tables of the up/ down genes

        for (dir in c('up', 'down')){
        contrast_de <- sig_differential[[pvt]][[i]][[dir]]
        cols_to_round <- c(params$differential_fc_column, params$differential_pval_column, params$differential_qval_column)
        contrast_de[, cols_to_round] <- signif(contrast_de[, cols_to_round], 8)

        colnames(contrast_de) <- prettifyVariablename(colnames(contrast_de))

        if (nrow(contrast_de) > 0){
            contrast_de <- round_dataframe_columns(contrast_de, digits=params$report_round_digits)
            print( htmltools::tagList(datatable(contrast_de, caption = paste('Differential genes', dir, 'in', contrast_descriptions[i], " (check", differential_files[[i]], "for more detail)"), rownames = FALSE) ))

        if ("Gene biotype" %in% colnames(contrast_de)) {
            # Plot Differentially Expressed Genes by Gene Biotype
            gene_biotype_table <- contrast_de %>%
                group_by(`Gene biotype`) %>%
                summarise(count = dplyr::n(), .groups = 'drop') %>%
                filter(count > 0) %>%
                arrange(desc(count))

            gene_biotype_plot <- ggplot(gene_biotype_table, aes(x = reorder(`Gene biotype`, -count), y = count)) +
                geom_bar(stat = "identity", position = position_dodge()) +
                labs(
                    title = paste0("Differentially Expressed Genes by Gene Biotype (", dir, ")"),
                    x = "Gene Biotype",
                    y = "Number of Differentially Expressed Genes"  ) +
                theme_minimal() +
                theme(axis.text.x = element_text(angle = 45, hjust = 1))

            print(gene_biotype_plot)
        } else {
            cat("Column 'Gene biotype' does not exist. Skipping plot.\n")
        }
        }else{
            cat(paste0("No significantly differential '", dir, "' genes.\n\n"))
        }
        }
    }
}
```

<!-- Gene set analysis results -->

```{r, echo=FALSE, results='asis'}
possible_gene_set_methods <- c('gsea', 'gprofiler2')
if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
    cat("\n### Gene set analysis\n")

    for (gene_set_method in possible_gene_set_methods){
        if (unlist(params[paste0(gene_set_method, '_run')])){
        cat("\n#### ", toupper(gene_set_method) ," {.tabset}\n")
        if (gene_set_method == 'gsea') {
            for (gmt_file in simpleSplit(params$gene_sets_files)) {
            gmt_name <- basename(tools::file_path_sans_ext(gmt_file))
            cat("\n##### ", gmt_name ," {.tabset}\n")

            reference_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$reference, '.tsv')
            target_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$target, '.tsv')
            for (i in 1:nrow(contrasts)){
                cat("\n###### ", contrast_descriptions[i], "\n")
                target_gsea_results <- read_metadata(target_gsea_tables[i])[,c(-2,-3)]
                target_gsea_results <- round_dataframe_columns(target_gsea_results, digits=params$report_round_digits)
                print( htmltools::tagList(datatable(target_gsea_results, caption = paste0("\nTarget (", contrasts$target[i], ")\n"), rownames = FALSE) ))
                ref_gsea_results <- read_metadata(reference_gsea_tables[i])[,c(-2,-3)]
                ref_gsea_results <- round_dataframe_columns(ref_gsea_results, digits=params$report_round_digits)
                print( htmltools::tagList(datatable(ref_gsea_results, caption = paste0("\nReference (", contrasts$reference[i], ")\n"), rownames = FALSE) ))
            }
            }

        } else if (gene_set_method == 'gprofiler2') {

            cat(paste0("\nThis section contains the results tables of the pathway analysis which was done with the R package gprofiler2. The differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.",
            ifelse(params$gprofiler2_significant, paste0(" Enrichment was only considered if significant, i.e. adjusted p-value <= ", params$gprofiler2_max_qval, "."), "Enrichment was also considered if not significant."), "\n"))

            # Make sure to grab only non-empty files
            for (i in 1:nrow(contrasts)) {
            cat(paste0("\n##### ", contrasts$id[i], "\n"))

            table <- paste0(contrasts$id[i], ".gprofiler2.all_enriched_pathways.tsv")
            table_path <- file.path(params$input_dir, table)
            if (!file.exists(table_path) || file.size(table_path) == 0){
                cat(paste0("No ", ifelse(params$gprofiler2_significant, "significantly", ""), " enriched pathways were found for this contrast."))
            } else {
                all_enriched <- read.table(table_path, header=T, sep="\t", quote="\"")
                all_enriched <- data.frame("Pathway name" = all_enriched$term_name, "Pathway code" = all_enriched$term_id,
                "Differential features" = all_enriched$intersection_size, "Pathway size" = all_enriched$term_size,
                "Differential fraction" = (all_enriched$intersection_size/all_enriched$term_size),
                "Adjusted p value" = all_enriched$p_value, check.names = FALSE)
                all_enriched <- round_dataframe_columns(all_enriched, digits=params$report_round_digits)
                print(htmltools::tagList(datatable(all_enriched, caption = paste('Enriched pathways in', contrasts$id[i], " (check", table, "for more detail)"), rownames = FALSE)))
            }
            cat("\n")
            }
        }
        }
    }
}
```

# Methods

```{r, echo=FALSE, results='asis', eval=params$study_type == 'maxquant'}
cat(paste0("\n## Protein abundance import\n"))
make_params_table('importing maxquant output', 'proteus_', remove_pattern = TRUE)
```

## Filtering

```{r, echo=FALSE, results='asis'}
make_params_table('feature-wise filtering', 'filtering_', remove_pattern = TRUE)
```

```{r, echo=FALSE, results='asis'}
filtering_string <- paste0('Filtering was carried out by selecting ', params$features_type, 's with an abundance of at least ', params$filtering_min_abundance)

if (is.null(params$filtering_grouping_var)){
    if (is.null(params$filtering_min_proportion)){
        filtering_string <- paste0(filtering_string, ' in at least ', params$filtering_min_samples, ' ', params$observations_type, 's.')
    }else{
        filtering_string <- paste0(filtering_string, ' in at least a proportion of ', params$filtering_min_proportion, ' of ', params$observations_type,'s.')
    }
}else{
    if (is.null(params$filtering_min_proportion)){
        filtering_string <- paste0(filtering_string, ' in at least the number of ', params$observations_type, 's corresponding to the smallest group size defined by the grouping variable "', params$filtering_grouping_var, '".')
    }else{
        filtering_string <- paste0(filtering_string, ' in at least a proportion of ', params$filtering_min_proportion, ' of the number of ', params$observations_type,'s corresponding to the smallest group size defined by the grouping variable"', params$filtering_grouping_var, '".')
    }
}
cat(filtering_string)
```

## Exploratory analysis

```{r, echo=FALSE, results='asis'}
make_params_table('exploratory analysis', 'exploratory_', remove_pattern = TRUE)
```

## Differential analysis


```{r, echo=FALSE, results='asis'}
if (params$study_type == 'rnaseq'){
    make_params_table('DESeq2', 'deseq2_', remove_pattern = TRUE)
}
make_params_table('downstream differential analysis', 'differential_', remove_pattern = TRUE)
```

<!-- If any gene set methods have been activated show their params -->

```{r, echo=FALSE, results='asis'}
possible_gene_set_methods <- c('gsea', 'gprofiler2')

if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
    cat("\n### Gene set analysis\n")

    for (gene_set_method in possible_gene_set_methods){
        if (unlist(params[paste0(gene_set_method, '_run')])){
        cat("\n#### ", toupper(gene_set_method) ,"  {.tabset}\n")
        make_params_table(toupper(gene_set_method), paste0(gene_set_method, '_'), remove_pattern = TRUE)
        }
    }

}
```

# Appendices

## All parameters

```{r, echo=FALSE, results='asis'}
print( htmltools::tagList(datatable(params_table, caption = "All parameters", rownames = FALSE) ))
```

## Software versions

**Note:** For a more detailed accounting of the software and commands used (including containers), consult the execution report produced as part of the 'pipeline info' for this workflow.

```{r, echo=FALSE, results='asis'}
versions_table <- data.frame(do.call(rbind, strsplit(names(versions), split = '\\.')), unlist(versions))
colnames(versions_table) <- c('Component', 'Software', 'Version')
print( htmltools::tagList(datatable(versions_table, caption = "Software versions", rownames = FALSE, options = list(dom = 'ft', paging = FALSE)) ))
```

```{r, echo=FALSE, results='asis'}
htmltools::includeMarkdown(params$citations)
```