add splitModule, a function to manually split feature module

mingl1997 · Jun 17, 2020 · f9b287b · f9b287b
1 parent 0198b73
commit f9b287b
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 6 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -66,6 +66,7 @@ export(sampleLabel)
 export(selectBestModel)
 export(simulateCells)
 export(simulateContamination)
+export(splitModule)
 export(subsetCeldaList)
 export(topRank)
 exportMethods("celdaClusters<-")
@@ -115,6 +116,7 @@ exportMethods(resamplePerplexity)
 exportMethods(runParams)
 exportMethods(sampleLabel)
 exportMethods(selectBestModel)
+exportMethods(splitModule)
 exportMethods(subsetCeldaList)
 import(Rcpp)
 import(RcppEigen)

diff --git a/R/celda_G.R b/R/celda_G.R
@@ -7,7 +7,7 @@
 #'  Rows represent features and columns represent cells.
 #' @param useAssay A string specifying which \link[SummarizedExperiment]{assay}
 #'  slot to use if \code{x} is a
-#'  \link[SingleCellExperiment]{SingleCellExperiment} object. Default "counts".
+#'  \linkS4class{SingleCellExperiment} object. Default "counts".
 #' @param L Integer. Number of feature modules.
 #' @param beta Numeric. Concentration parameter for Phi. Adds a pseudocount to
 #'  each feature module in each cell. Default 1.
@@ -45,8 +45,10 @@
 #' @param logfile Character. Messages will be redirected to a file named
 #'  `logfile`. If NULL, messages will be printed to stdout.  Default NULL.
 #' @param verbose Logical. Whether to print log messages. Default TRUE.
-#' @return An object of class `celda_G` with the feature module clusters stored
-#'  in `y`.
+#' @return A \linkS4class{SingleCellExperiment} object. Function
+#'  parameter settings are stored in the \link[S4Vectors]{metadata}
+#'  \code{"celda_parameters"} slot. Column \code{celda_feature_module} in
+#'  \link[SummarizedExperiment]{rowData} contains feature modules.
 #' @seealso \link{celda_C} for cell clustering and \link{celda_CG} for
 #'  simultaneous clustering of features and cells. \link{celdaGridSearch} can
 #'  be used to run multiple values of L and multiple chains in parallel.

diff --git a/R/splitModule.R b/R/splitModule.R
@@ -0,0 +1,130 @@
+#' @title Split celda feature module
+#' @description Manually select a celda feature module to split into 2 or
+#'  more modules. Useful for splitting up modules that show divergent
+#'  expression of features in multiple cell clusters.
+#' @param x A \linkS4class{SingleCellExperiment} object
+#'  with the matrix located in the assay slot under \code{useAssay}.
+#'  Rows represent features and columns represent cells.
+#' @param useAssay A string specifying which \link[SummarizedExperiment]{assay}
+#'  slot to use for \code{x}. Default "counts".
+#' @param module Integer. The module to be split. An error is raised if the
+#'  module is not found in \code{x} or if it contains only one feature.
+#' @param n Integer. How many modules should \code{module} be split into.
+#'  Default 2.
+#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility,
+#'  a default value of 12345 is used. If NULL, no calls to
+#'  \link[withr]{with_seed} are made.
+#' @param ... Additional arguments passed on to the method selected for the
+#'  class of \code{x}.
+#' @return An updated \linkS4class{SingleCellExperiment} object with new
+#'  feature modules stored in column \code{celda_feature_module} in
+#'  \code{\link[SummarizedExperiment]{rowData}(x)}. Part of the features of
+#'  \code{module} keep their original label; the remaining features are
+#'  assigned to new modules labelled after the current number of modules
+#'  \code{L}.
+#' @export
+setGeneric("splitModule",
+    function(x, ...) {
+        standardGeneric("splitModule")
+    })
+
+
+#' @rdname splitModule
+#' @examples
+#' data(sceCeldaCG)
+#' # Split module 5 into 2 new modules.
+#' sce <- splitModule(sceCeldaCG, module = 5)
+#' @export
+setMethod("splitModule", signature(x = "SingleCellExperiment"),
+    function(x,
+        useAssay = "counts",
+        module,
+        n = 2,
+        seed = 12345) {
+
+        # Validate against the feature module labels. celdaModules(x) is the
+        # same accessor .splitModule() uses to locate the features to split,
+        # so a module accepted here is guaranteed to be found there.
+        if (!module %in% celdaModules(x)) {
+            stop("Module ", module, " is not found in celdaModules(x).",
+                " Please specify a valid module.")
+        }
+
+        # celda_G model holding the relabelled modules, the new L and the
+        # log-likelihood recomputed on the full counts matrix.
+        celdaGMod <- .splitModuleWithSeed(x = x,
+            useAssay = useAssay,
+            module = module,
+            n = n,
+            seed = seed)
+
+        # Copy the results of the split back into the SingleCellExperiment.
+        # The fitted object is `celdaGMod`; no other model object exists in
+        # this scope.
+        S4Vectors::metadata(x)[["celda_parameters"]]$L <-
+            params(celdaGMod)$L
+        S4Vectors::metadata(x)[["celda_parameters"]]$finalLogLik <-
+            celdaGMod@finalLogLik
+        S4Vectors::metadata(x)[["celda_parameters"]]$featureModuleLevels <-
+            sort(unique(celdaClusters(celdaGMod)$y))
+        SummarizedExperiment::rowData(x)["celda_feature_module"] <-
+            celdaClusters(celdaGMod)$y
+        return(x)
+    }
+)
+
+
+# Internal wrapper around .splitModule() that controls the random seed.
+# When `seed` is NULL the split is run directly and with_seed() is never
+# called; otherwise the split is evaluated inside with_seed(seed, ...).
+# Returns whatever .splitModule() returns.
+.splitModuleWithSeed <- function(x,
+    useAssay,
+    module,
+    n,
+    seed) {
+
+    if (!is.null(seed)) {
+        # with_seed() hands back the value of the evaluated code.
+        return(with_seed(seed, .splitModule(x, useAssay, module, n)))
+    }
+    return(.splitModule(x, useAssay, module, n))
+}
+
+
+# Internal worker for splitModule().
+#
+# Takes the features currently assigned to `module`, clusters them into `n`
+# groups with .celda_G() (random initialization, single chain, no split
+# heuristic), and relabels them: features in sub-cluster 1 keep the label
+# `module`, features in sub-cluster k > 1 receive label L + k - 1, where L is
+# the number of modules stored in metadata(x)$celda_parameters.
+#
+# Arguments:
+#   x        Object providing the assay, celdaModules(x) and the
+#            "celda_parameters" metadata entry.
+#   useAssay Name of the assay holding the counts matrix.
+#   module   Label of the module to split.
+#   n        Number of groups to split `module` into.
+#
+# Returns a "celda_G" object with the new module labels in clusters$y, the
+# updated L, and the log-likelihood recomputed on the full counts matrix.
+# Stops with an error when `module` contains one feature or fewer.
+.splitModule <- function(x, useAssay, module, n) {
+    counts <- SummarizedExperiment::assay(x, i = useAssay)
+    .validateCounts(counts)
+    counts <- as.matrix(counts)
+    ix <- celdaModules(x) == module
+
+    # A module with a single feature cannot be divided any further.
+    if (sum(ix) <= 1) {
+        stop("Module ", module, " contains <= 1 feature. No additional",
+            " splitting was able to be performed.")
+    }
+
+    # Parameters of the existing model; looked up once and reused below.
+    celdaParams <- S4Vectors::metadata(x)$celda_parameters
+
+    # Cluster only the features of the selected module.
+    tempModel <- .celda_G(
+        counts = counts[ix, , drop = FALSE],
+        L = n,
+        yInitialize = "random",
+        splitOnIter = -1,
+        splitOnLast = FALSE,
+        nchains = 1,
+        verbose = FALSE)
+
+    # Map sub-cluster labels onto module labels: sub-cluster 1 stays in
+    # `module`, sub-clusters 2..n are appended after the existing L modules.
+    splitY <- celdaClusters(tempModel)$y
+    splitIx <- splitY > 1
+    splitY[splitIx] <- celdaParams$L + splitY[splitIx] - 1
+    splitY[!splitIx] <- module
+
+    newY <- celdaModules(x)
+    newY[ix] <- splitY
+    newL <- max(newY)
+
+    # Score the relabelled model against the full counts matrix.
+    newLl <- .logLikelihoodcelda_G(
+        counts = counts,
+        y = newY,
+        L = newL,
+        beta = celdaParams$beta,
+        delta = celdaParams$delta,
+        gamma = celdaParams$gamma)
+
+    model <- methods::new(
+        "celda_G",
+        clusters = list(y = newY),
+        params = list(
+            L = newL,
+            beta = celdaParams$beta,
+            delta = celdaParams$delta,
+            gamma = celdaParams$gamma,
+            countChecksum = .createCountChecksum(counts)
+        ),
+        names = list(row = rownames(x),
+            column = colnames(x),
+            sample = celdaParams$sampleLevels),
+        finalLogLik = newLl
+    )
+    return(model)
+}
diff --git a/man/celda_G.Rd b/man/celda_G.Rd
diff --git a/man/splitModule.Rd b/man/splitModule.Rd