workflow4metabolomics · hechth · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/tools/batch_correction/.shed.yml b/tools/batch_correction/.shed.yml
@@ -0,0 +1,8 @@
+name: batchcorrection
+owner: melpetera
+description: '[W4M][Metabolomics][LC-MS] Correction of data intensities for signal drift and batch-effects.'
+homepage_url: http://workflow4metabolomics.org
+long_description: 'Instrumental drift and offset differences between batches have been described in LC-MS experiments when the number of samples is large and/or multiple batches of acquisition are needed. Recently a normalization strategy relying on the measurements of a "pooled" (or QC) sample injected periodically has been described: for each variable, a regression model is fitted to the values of the "pool" and subsequently used to adjust the intensities of the samples of interest (van der Kloet et al, 2009; Dunn et al, 2011). The current repository contains two modules: "Determine batch correction" and "Batch correction". The "Batch correction" module provides two strategies for normalization: variables can be either first checked to assess which of them should be corrected (in that case the "Determine Batch Correction" module provides the information about the correction which will be applied), or all variables can be corrected ("all loess" options). In the latter case, it is possible to fit the model on the samples instead of the pools. Output figures and files are provided to assess the quality of the normalization.' 
+remote_repository_url: https://github.com/workflow4metabolomics/batch_correction.git
+categories:
+- Metabolomics
diff --git a/tools/batch_correction/CHANGELOG.md b/tools/batch_correction/CHANGELOG.md
@@ -0,0 +1,48 @@
+
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [3.0.0] - 2025-07-10
+### Added
+- Specific names for the 'sampleType', 'injectionOrder', and 'batch' from sampleMetadata are now available in a dedicated parameter section.
+- Addition of a sum of ions before/after plot for linear/lowess/loess methods.
+- Addition of a third option in "Null values" parameter (renamed "unconsistant values") in linear/lowess/loess methods.
+- linear/lowess/loess methods now handle NA in intensities and allow "blank" samples in the dataset.
+
+### Changed
+- XML optimisation using macros.
+- Output name changes.
+- linear/lowess/loess methods: disabling of RData output.
+- linear/lowess/loess methods: split of tool-linked code and script-linked one.
+- linear/lowess/loess methods: adjustments to the normalisation process to accommodate NA values in intensities.
+- linear/lowess/loess methods: better handling of special characters in IDs and column names.
+
+## [2.2.4] - 2024-xx-xx
+### Fixed
+- Fixed bug for pool selection ("all_loess" methods).
+
+## [2.2.2] - 2024-xx-xx
+### Fixed
+- Fixed bug for color plot ("all_loess" methods).
+
+## [2.2.0] - 2024-xx-xx
+### Added
+- Specific names for the 'sampleType', 'injectionOrder', and 'batch' from sampleMetadata can be selected by the user (for compatibility with the MTBLS downloader).
+
+## [2.1.2] - 2024-xx-xx
+### Changed
+- Minor modifications in config file.
+
+## [2.1.0] - 2024-xx-xx
+### Changed
+- For PCA figure display only (**all_loess** options): missing values are set to the minimum value before PCA computation is performed (with svd).
+- Additional running and installation tests added with planemo, conda, and travis.
+
+### Fixed
+- Variables with NA or 0 values in all reference samples are discarded before applying the **all_loess** normalization.
+
+### Changed
+- Modifications of the **all_loess_wrapper** file to handle the recent **ropls** package versions (i.e. 1.3.15 and above) which use S4 classes.
diff --git a/tools/batch_correction/batch_correction.xml b/tools/batch_correction/batch_correction.xml
diff --git a/tools/batch_correction/batch_correction_3Lfct.R b/tools/batch_correction/batch_correction_3Lfct.R
diff --git a/tools/batch_correction/batch_correction_3Llauncher.R b/tools/batch_correction/batch_correction_3Llauncher.R
@@ -0,0 +1,166 @@
+###############################################################################################################
+# batch_correction_3Llauncher                                                                                 #
+#                                                                                                             #
+# Authors: Jean-Francois MARTIN / Melanie Petera                                                              #
+# Starting date: 04-08-2020                                                                                   #
+# Based on batch_correction_wrapper.R version 2.91                                                            #
+# Version 1: 02-10-2020                                                                                       #
+#            - split of tool-linked code and script-linked one                                                #
+#            - handling of sample tags' parameters                                                            #
+#            - accepting samples beyond pools and samples                                                     #
+#            - dealing with special characters in IDs and column names                                        #
+#            - adding a min.norm argument to the function                                                     #
+#                                                                                                             #
+# Input files: dataMatrix.txt, sampleMetadata.txt, variableMetadata.txt (BC only)                             #
+# Output files: graph.pdf, corrected table (BC only), diagnostic table (DBC only), variableMetadata (BC only) #
+#                                                                                                             #
+###############################################################################################################
+
+# meth3L: launcher for the linear/lowess/loess ("3L") analyses.
+#
+# Imports the input tables, checks their consistency, merges the sample metadata with the
+# transposed intensities, then either runs the correction (norm_QCpool) and writes the result
+# tables, or runs the pre-normalisation diagnostic (plotsituation).
+#
+# Arguments:
+#   idsample                  path to the sampleMetadata file (tab-separated, with header)
+#   iddata                    path to the dataMatrix file (tab-separated, with header)
+#   sample_type_col_name,
+#   injection_order_col_name,
+#   batch_col_name            names of the mandatory columns looked up in the sample metadata
+#   sample_type_tags          list whose 'sample' and 'pool' elements hold the tags searched for in the
+#                             sample type column; forwarded as a whole to the processing functions
+#   analyse                   "batch_correction" triggers the correction; any other value triggers plotsituation()
+#   metaion                   path to the variableMetadata file (only read when analyse is "batch_correction")
+#   factbio, span             forwarded to norm_QCpool() or plotsituation()
+#   detail, method, outlog,
+#   valnull, min.norm         forwarded to norm_QCpool()
+#   rdata_output              path of the RData dump of the results; skipped when NULL
+#   dataMatrix_out,
+#   variableMetadata_out      paths of the tab-separated tables written after correction
+#   out_graph_pdf,
+#   out_preNormSummary        forwarded to plotsituation()
+#
+# Stops (through stop() or check_err()) when a table cannot be imported or when a consistency check fails.
+# match2, stock_id, reproduce_id, check_err, norm_QCpool and plotsituation are defined outside this file.
+meth3L <- function(idsample, iddata, sample_type_col_name, injection_order_col_name, batch_col_name, sample_type_tags,
+                   factbio, analyse, metaion, detail, method, outlog, span, valnull,
+                   rdata_output, dataMatrix_out, variableMetadata_out, out_graph_pdf, out_preNormSummary, min.norm) {
+    ## Import function
+    # Reads a tab-separated table; on failure tryCatch() yields the R error message (a character
+    # string) instead of a data frame, which is reported to the user together with the table type.
+    tab.import <- function(tested.file, tabtype) {
+        tab.res <- tryCatch(read.table(tested.file, header = TRUE, sep = "\t", check.names = FALSE, comment.char = ""), error = conditionMessage)
+        # Testing the class rather than the length: a one-column data frame also has length 1
+        if (!is.data.frame(tab.res)) {
+            stop(paste("Could not import the", tabtype, "file. There may be issues in your table integrity.\nCorresponding R error message:\n", tab.res))
+        }
+        # Second import with quote handling disabled, used as a reference for the table dimensions
+        tab.comp <- tryCatch(read.table(tested.file, header = TRUE, sep = "\t", check.names = FALSE, comment.char = "", quote = ""), error = conditionMessage)
+        # identical() gives the single TRUE/FALSE that '&&' requires (comparing the two dim vectors with
+        # '!=' gives a length-2 result, which is an error for '&&' in recent R versions)
+        if (is.data.frame(tab.comp) && !identical(dim(tab.res), dim(tab.comp))) { # wrong original import due to quotes inside a column name
+            return(tab.comp)
+        }
+        tab.res
+    }
+
+    ## Reading of input files
+    idsample <- tab.import(idsample, "sampleMetadata")
+    iddata <- tab.import(iddata, "dataMatrix")
+
+    ### Table match check
+    table.check <- match2(iddata, idsample, "sample")
+    if (length(table.check) > 1) {
+        check_err(table.check)
+    }
+
+    ### StockID
+    samp.id <- stock_id(iddata, idsample, "sample")
+    iddata <- samp.id$dataMatrix
+    idsample <- samp.id$Metadata
+    samp.id <- samp.id$id.match
+
+    ### Checking mandatory variables
+    # mand.check starts as a single empty string, so a length above 1 means at least one column is missing
+    mand.check <- ""
+    for (mandcol in c(sample_type_col_name, injection_order_col_name, batch_col_name)) {
+        if (!(mandcol %in% colnames(idsample))) {
+            mand.check <- c(
+                mand.check, "\nError: no '", mandcol, "' column in sample metadata.\n",
+                "Note: column names are case-sensitive.\n"
+            )
+        }
+    }
+    if (length(mand.check) > 1) {
+        mand.check <- c(
+            mand.check, "\nFor more information, see the help section or:",
+            "\n http://workflow4metabolomics.org/sites/",
+            "workflow4metabolomics.org/files/files/w4e-2016-data_processing.pdf\n"
+        )
+        check_err(mand.check)
+    }
+
+    if (analyse == "batch_correction") {
+        ## Reading of Metadata Ions file
+        metaion <- read.table(metaion, header = TRUE, sep = "\t", check.names = FALSE, comment.char = "")
+        ## Table match check
+        table.check <- c(table.check, match2(iddata, metaion, "variable"))
+        ## StockID
+        var.id <- stock_id(iddata, metaion, "variable")
+        iddata <- var.id$dataMatrix
+        metaion <- var.id$Metadata
+        var.id <- var.id$id.match
+    }
+
+    ### Formatting
+    idsample[[1]] <- make.names(idsample[[1]])
+    dimnames(iddata)[[1]] <- iddata[[1]]
+
+    ### Transposition of ions data
+    # drop = FALSE keeps a data frame (and thus the sample name) even with a single sample column
+    idTdata <- t(iddata[, 2:dim(iddata)[2], drop = FALSE])
+    idTdata <- data.frame(dimnames(idTdata)[[1]], idTdata)
+
+    ### Merge of 2 files (ok even if the two dataframe are not sorted on the same key)
+    ids <- merge(idsample, idTdata, by.x = 1, by.y = 1)
+
+    ids[[batch_col_name]] <- as.factor(ids[[batch_col_name]])
+    # Number of sample metadata columns, i.e. how many leading columns of 'ids' come from idsample
+    nbid <- dim(idsample)[2]
+
+    ### Checking the number of sample and pool
+
+    # At least 2 samples
+    if (sum(ids[[sample_type_col_name]] %in% sample_type_tags$sample) < 2) {
+        table.check <- c(
+            table.check, "\nError: less than 2 samples specified in sample metadata.",
+            "\nMake sure this is not due to errors in your ", sample_type_col_name, " coding.\n"
+        )
+    }
+
+    # Number of pools in each batch
+    batch.levels <- levels(ids[[batch_col_name]])
+    B <- rep(0, length(batch.levels))
+    for (nbB in seq_along(batch.levels)) {
+        in.batch <- which(ids[[batch_col_name]] == batch.levels[nbB])
+        B[nbB] <- sum(ids[[sample_type_col_name]][in.batch] %in% sample_type_tags$pool)
+    }
+    # NOTE(review): the error is raised only when NO batch holds at least 2 pools, whereas the message
+    # below reads as if every batch needed 2 pools - confirm which rule is intended before tightening it.
+    if (!any(B > 1)) {
+        table.check <- c(
+            table.check, "\nError: less than 2 pools specified in at least one batch in sample metadata.",
+            "\nMake sure this is not due to errors in your ", sample_type_col_name, " coding.\n"
+        )
+    }
+
+    ### Checking the unicity of samples and variables
+    # Appends to err.obj an explicit message listing each duplicated ID of the first column with its count
+    uni.check <- function(tested.tab, tabtype, err.obj) {
+        unicity <- duplicated(tested.tab[, 1])
+        if (any(unicity)) {
+            # Sending back an explicit error
+            # (+ 1 because duplicated() does not flag the first occurrence of an ID)
+            duptable <- t(t(table(tested.tab[, 1][unicity]) + 1))
+            err.obj <- c(
+                err.obj, paste0("\n-------\nError: your '", tabtype, "' IDs contain duplicates:\n"),
+                paste(rownames(duptable), duptable, sep = ": ", collapse = "\n"),
+                "\nSince identifiers are meant to be unique, please check your data.\n-------\n"
+            )
+        }
+        return(err.obj)
+    }
+    table.check <- uni.check(ids, "sample", table.check)
+    if (analyse == "batch_correction") {
+        table.check <- uni.check(metaion, "variable", table.check)
+    }
+
+    ## error check
+    check_err(table.check)
+
+
+    ### BC/DBC-specific processing
+
+    # Gathering mandatory information in a single object
+    sm.meta <- list(batch = batch_col_name, injectionOrder = injection_order_col_name, sampleType = sample_type_col_name, sampleTag = sample_type_tags)
+
+    if (analyse == "batch_correction") {
+        ## Launch
+        res <- norm_QCpool(ids, nbid, outlog, factbio, metaion, detail, FALSE, FALSE, method, span, valnull, sm.meta, min.norm)
+        ## Get back original IDs
+        var.id <- reproduce_id(res[[1]], res[[2]], "variable", var.id)
+        res[[1]] <- var.id$dataMatrix
+        res[[2]] <- var.id$Metadata
+        samp.id <- reproduce_id(res[[1]], res[[3]], "sample", samp.id)
+        res[[1]] <- samp.id$dataMatrix
+        res[[3]] <- samp.id$Metadata
+        ## Save files
+        # The RData dump is optional: save() cannot write to a NULL path, so it is skipped in that case
+        if (!is.null(rdata_output)) {
+            save(res, file = rdata_output)
+        }
+        write.table(res[[1]], file = dataMatrix_out, sep = "\t", row.names = FALSE, quote = FALSE)
+        write.table(res[[2]], file = variableMetadata_out, sep = "\t", row.names = FALSE, quote = FALSE)
+    } else {
+        ## Launch
+        plotsituation(ids, nbid, out_graph_pdf, out_preNormSummary, factbio, span, sm.meta)
+    }
+} # end of meth3L
diff --git a/tools/batch_correction/batch_correction_3Lwrapper.R b/tools/batch_correction/batch_correction_3Lwrapper.R
@@ -0,0 +1,157 @@
+#!/usr/bin/env Rscript
+
+################################################################################################
+# batch_correction_wrapper                                                                     #
+#                                                                                              #
+# Authors: Marion LANDI / Jean-Francois MARTIN / Melanie Petera                                #
+# User: Galaxy                                                                                 #
+# Original data: --                                                                            #
+# Starting date: 22-07-2014                                                                    #
+# Version 1: 22-07-2014                                                                        #
+# Version 2: 08-12-2014                                                                        #
+# Version 2.1: 09-01-2015 modification in Error message of sample matching                     #
+# Version 2.2: 16-03-2015 inclusion of miniTools' functions for special characters             #
+# Version 2.90: 18-08-2015 new parameter valnull                                               #
+# Version 2.91: 25-08-2016 error message improvement                                           #
+# Version 3: 02-10-2020                                                                        #
+#            - split of tool-linked code and script-linked one                                 #
+#            - addition of args print and sessionInfo()                                        #
+#            - adjustment of sample tags' parameters to 3L methods                             #
+#            - addition of the min.norm argument in meth3L() call                              #
+#                                                                                              #
+# Input files: dataMatrix.txt, sampleMetadata.txt, variableMetadata.txt (BC only)              #
+# Output files: graph.pdf, corrected table (BC only), diagnostic table (DBC only),             #
+#               variableMetadata (BC only)                                                     #
+#                                                                                              #
+################################################################################################
+
+
+library(W4MRUtils)
+
+## ------------------------------
+## test help option
+## ------------------------------
+
+# Prog. constants
+# The full command line (R's own options included) is needed to recover the script path from --file=
+argv.help <- commandArgs(trailingOnly = FALSE)
+script.path <- sub("--file=", "", argv.help[grep("--file=", argv.help)])
+prog.name <- basename(script.path)
+
+# Test Help
+# The help flag must match a whole argument: a pattern search for "-h" would also be triggered by
+# any value that merely contains it (for instance a file path such as "/data/batch-hilic/dataMatrix.tsv"),
+# printing the usage and quitting instead of running the tool.
+if (any(argv.help %in% c("-h", "--help"))) {
+    cat(
+        "Usage: Rscript ",
+        prog.name,
+        "{args} \n",
+        "parameters: \n",
+        "\tanalyse {val}: must be set to \"batch_correction\" \n",
+        "\tdataMatrix {file}: set the input data matrix file (mandatory) \n",
+        "\tsampleMetadata {file}: set the input sample metadata file (mandatory) \n",
+        "\tvariableMetadata {file}: set the input variable metadata file (mandatory) \n",
+        "\tmethod {opt}: set the method; can set to \"linear\", \"lowess\" or \"loess\" (mandatory) \n",
+        "\tspan {condition}: set the span condition; set to \"none\" if method is set to \"linear\" (mandatory) \n",
+        "\tref_factor {value}: set the ref_factor value; (if span value is set to NULL, optional) \n",
+        "\tdetail {value}: set the detail value; (if span value is set to NULL, optional) \n",
+        "\tdataMatrix_out {file}: set the output data matrix file (mandatory) \n",
+        "\tvariableMetadata_out {file}: set the output variable metadata file (mandatory) \n",
+        "\tgraph_output {file}: set the output graph file (mandatory) \n",
+        "\trdata_output {file}: set the output Rdata file (mandatory) \n",
+        "\tbatch_col_name {val}: the column name for batch. Default value is \"batch\".\n",
+        "\tinjection_order_col_name {val}: the column name for the injection order. Default value is \"injectionOrder\".\n",
+        "\tsample_type_col_name {val}: the column name for the sample types. Default value is \"sampleType\".\n",
+        "\tsample_type_tags {val}: the tags used inside the sample type column, defined as key/value pairs separated by commas (example: blank=blank,pool=pool,sample=sample).\n",
+        "\n"
+    )
+    quit(status = 0)
+}
+
+## ------------------------------
+## init. params
+## ------------------------------
+
+args <- parse_args() # interpretation of arguments given in command line as an R list of objects
+
+
+# Log the starting time and the parameters exactly as received (before any default is applied)
+cat(
+    "\nJob starting time:\n", format(Sys.time(), "%a %d %b %Y %X"),
+    "\n\n--------------------------------------------------------------------",
+    "\nParameters used:\n\n"
+)
+print(args)
+cat("--------------------------------------------------------------------\n\n")
+
+
+# Set default col names
+# Each default only applies when the option is absent from the command line
+if (!"batch_col_name" %in% names(args)) {
+    args[["batch_col_name"]] <- "batch"
+}
+if (!"injection_order_col_name" %in% names(args)) {
+    args[["injection_order_col_name"]] <- "injectionOrder"
+}
+if (!"sample_type_col_name" %in% names(args)) {
+    args[["sample_type_col_name"]] <- "sampleType"
+}
+if (!"sample_type_tags" %in% names(args)) {
+    args[["sample_type_tags"]] <- "blank=blank,pool=pool,sample=sample"
+}
+
+# Parse sample type tags
+# The option is a comma-separated list of entries; each entry is split on "=": the first piece
+# names the sample type and every remaining piece is kept as a tag for that type.
+sample.type.tags <- list()
+for (kv in strsplit(strsplit(args$sample_type_tags, ",")[[1]], "=")) {
+    sample.type.tags[[kv[[1]]]] <- kv[-1]
+}
+# The message refers to the option by the name actually used on the command line
+if (!all(c("pool", "blank", "sample") %in% names(sample.type.tags))) {
+    stop("All tags pool, blank and sample must be defined in option sample_type_tags.")
+}
+# From here on, sample_type_tags is a named list instead of the raw string
+args$sample_type_tags <- sample.type.tags
+
+## ------------------------------
+## init. functions
+## ------------------------------
+
+# source_local: source R files located in the same directory as the running script.
+#   ...  file names (character strings), resolved against the directory of the path given
+#        through the --file= option of the command line.
+# The files are sourced one after the other for their side effects; nothing is done when no
+# file name is given.
+source_local <- function(...) {
+    argv <- commandArgs(trailingOnly = FALSE)
+    # substring(, 8) strips the 7 characters of "--file=" to keep only the script path
+    base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
+    # Iterating over the arguments themselves is safe with an empty '...' (1:0 would run twice)
+    for (local_file in list(...)) {
+        source(paste(base_dir, local_file, sep = "/"))
+    }
+}
+# Load the processing functions shipped next to this script
+source_local("batch_correction_3Lfct.R", "batch_correction_3Llauncher.R")
+
+# Specificities of BC and DBC: discard the arguments that do not apply to the requested analysis
+unused.args <- if (args$analyse == "batch_correction") {
+    c("out_graph_pdf", "out_preNormSummary")
+} else {
+    c(
+        "variableMetadata", "rdata_output", "dataMatrix_out", "variableMetadata_out",
+        "graph_output", "method", "detail", "valnull"
+    )
+}
+args[unused.args] <- NULL
+
+# Launch tool (discarded arguments are passed as NULL)
+meth3L(
+    idsample = args$sampleMetadata,
+    iddata = args$dataMatrix,
+    sample_type_col_name = args$sample_type_col_name,
+    injection_order_col_name = args$injection_order_col_name,
+    batch_col_name = args$batch_col_name,
+    sample_type_tags = args$sample_type_tags,
+    factbio = args$ref_factor,
+    analyse = args$analyse,
+    metaion = args$variableMetadata,
+    detail = args$detail,
+    method = args$method,
+    outlog = args$graph_output,
+    span = args$span,
+    valnull = args$valnull,
+    rdata_output = args$rdata_output,
+    dataMatrix_out = args$dataMatrix_out,
+    variableMetadata_out = args$variableMetadata_out,
+    out_graph_pdf = args$out_graph_pdf,
+    out_preNormSummary = args$out_preNormSummary,
+    min.norm = 1
+)
+
+
+# Closing log: R session details, then the ending time
+cat(
+    "\n\n--------------------------------------------------------------------",
+    "\nInformation about R (version, Operating System, attached or loaded packages):\n\n"
+)
+sessionInfo()
+cat(
+    "--------------------------------------------------------------------\n",
+    "\nJob ending time:\n", format(Sys.time(), "%a %d %b %Y %X")
+)
+
+rm(args, unused.args)