From ec86494c24ba34b2be2e917499dc9f446cb889a4 Mon Sep 17 00:00:00 2001 From: Dongyuan Song Date: Thu, 7 Nov 2024 19:03:35 -0800 Subject: [PATCH] Fix bugs in alternative choices of correaltion estimation --- DESCRIPTION | 4 ++-- NEWS.md | 3 +++ R/fit_copula.R | 10 ++++++---- R/fit_marginal.R | 2 +- R/scdesign3.R | 3 +++ README.md | 9 ++++++++- man/fit_copula.Rd | 3 +++ man/fit_marginal.Rd | 2 +- man/scdesign3.Rd | 3 +++ 9 files changed, 30 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2e33c11..506aa7e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: scDesign3 Type: Package Title: A unified framework of realistic in silico data generation and statistical model inference for single-cell and spatial omics -Version: 1.1.3 +Version: 1.1.4 Authors@R: c(person("Dongyuan", "Song", , "dongyuansong@ucla.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-1114-1215")), @@ -55,6 +55,6 @@ biocViews: Spatial URL: https://github.com/SONGDONGYUAN1994/scDesign3 BugReports: https://github.com/SONGDONGYUAN1994/scDesign3/issues -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Config/testthat/edition: 3 VignetteBuilder: knitr diff --git a/NEWS.md b/NEWS.md index 89bf24a..dca0f6f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# 1.1.4 (2024-11-07) +* Add options for correlation matrix calculation + # 1.1.3 (2024-06-22) * Add sparse Gaussian copula for `fit_copula` * Add automatic k selection for `fit_marginal` diff --git a/R/fit_copula.R b/R/fit_copula.R index b67025d..4e62aa5 100644 --- a/R/fit_copula.R +++ b/R/fit_copula.R @@ -27,7 +27,7 @@ #' The default value for is "all" (a special string which means no filtering). #' @param if_sparse A logic variable. Only works for Gaussian copula (\code{family_set = "gaussian"}). If TRUE, a thresholding strategy will make the corr matrix sparse. #' @param n_cores An integer. The number of cores to use. -#' @param correlation_function A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which call BLAS. +#' @param correlation_function A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS. #' @param parallelization A string indicating the specific parallelization function to use. #' Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package #' \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'. @@ -257,6 +257,7 @@ fit_copula <- function(sce, curr_mat, important_feature = important_feature, if_sparse = if_sparse, + correlation_function = correlation_function, lambda = 0.05, tol = 1e-8, ind = ind @@ -367,6 +368,7 @@ fit_copula <- function(sce, ## Calculate the correlation matrix. If use sparse cor estimation, package spcov will be used (it can be VERY SLOW). cal_cor <- function(norm.mat, important_feature, + correlation_function = "default", if_sparse = FALSE, lambda = 0.05, tol = 1e-8, @@ -386,7 +388,7 @@ cal_cor <- function(norm.mat, operator = 'hard', corr = TRUE) } else { - important_cor.mat <- correlation(important.mat, correlation_function = correlation_fucntion) + important_cor.mat <- correlation(important.mat, correlation_function = correlation_function) } #s_d <- apply(norm.mat, 2, stats::sd) @@ -910,7 +912,7 @@ cal_bic <- function(norm.mat, } ## Similar to the cora function from "Rfast" but uses different functions to calculate column means and row sums. -correlation <- function(x, correlation_function) { +correlation <- function(x, correlation_function = "default") { if(correlation_function == "default") { mat <- t(x) - matrixStats::colMeans2(x) mat <- mat / sqrt(matrixStats::rowSums2(mat^2)) @@ -925,7 +927,7 @@ correlation <- function(x, correlation_function) { } ## Similar to the cova function from "Rfast" but uses different functions to calculate column means and row sums. -covariance <- function(x) { +covariance <- function(x, correlation_function = "default") { if(correlation_function == "default") { n <- dim(x)[1] m <- sqrt(n) * matrixStats::colMeans2(x) diff --git a/R/fit_marginal.R b/R/fit_marginal.R index b590efa..0fdfc52 100644 --- a/R/fit_marginal.R +++ b/R/fit_marginal.R @@ -25,7 +25,7 @@ #' @param trace A logic variable. If TRUE, the warning/error log and runtime for gam/gamlss will be returned. #' will be returned, FALSE otherwise. Default is FALSE. #' @param simplify A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}. Default is FALSE. -#' @param filter_cell A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene. +#' @param filter_cells A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene. #' @return A list of fitted regression models. The length is equal to the total feature number. #' @examples #' data(example_sce) diff --git a/R/scdesign3.R b/R/scdesign3.R index 8ef174f..cc99778 100644 --- a/R/scdesign3.R +++ b/R/scdesign3.R @@ -15,6 +15,7 @@ #' @param family_use A string of the marginal distribution. #' Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'. #' @param n_cores An integer. The number of cores to use. +#' @param correlation_function A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS. #' @param usebam A logic variable. If use \code{\link[mgcv]{bam}} for acceleration in marginal fitting. #' @param edf_flexible A logic variable. It is used for accelerating for spatial model if k is large in 'mu_formula'. Default is FALSE. #' @param corr_formula A string of the correlation structure. @@ -89,6 +90,7 @@ scdesign3 <- function(sce, sigma_formula = "1", family_use = "nb", n_cores = 2, + correlation_function = "default", usebam = FALSE, edf_flexible = FALSE, corr_formula, @@ -167,6 +169,7 @@ scdesign3 <- function(sce, copula = copula, family_set = family_set, n_cores = n_cores, + correlation_function = correlation_function, important_feature = important_feature, if_sparse = if_sparse, parallelization = parallelization, diff --git a/README.md b/README.md index b54ec86..96beb55 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,9 @@ To find out more details about **scDesign3**, you can check out our manuscript o [Song, D., Wang, Q., Yan, G. *et al.* scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics. *Nat Biotechnol* **42**, 247–252 (2024).](https://www.nature.com/articles/s41587-023-01772-1) -Please note that the parallel computing of scDesign3 is mainly designed for UNIX OS; be careful when you set `n_cores`. +The computational time is quadratic to the number of features used in copula modeling. Reducing this number will greatly speed up the calculation. + +Please note that the parallel computing of scDesign3 is mainly designed for **UNIX OS**; be careful when you set `n_cores`. Please note that you should consider **the balance** between `n_cores` and your ROM (memory). Simply increasing the number of cores without the increase of memory will slow down or froze your program. We recommend that you should allocate at least 1 GB for 1 core. # Table of contents 1. [Installation](#installation-) @@ -49,6 +51,7 @@ example_simu <- scdesign3( sigma_formula = "s(pseudotime, k = 5, bs = 'cr')", family_use = "nb", n_cores = 2, + correlation_function = "default", usebam = FALSE, corr_formula = "1", copula = "gaussian", @@ -78,6 +81,7 @@ The parameters of `scdesign3()` are: - `sigma_formula`: A string of the sigma parameter formula for fitting each gene's marginal distribution. - `family_use`: A string of the marginal distribution you want to use when fitting each gene's marginal distribution. Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'. - `n_cores`: An integer. The number of cores to use. +- `correlation_function`: A string. If 'default', the function from Rfast; if 'coop', the function from coop, which calls BLAS. - `usebam`: A logic variable. If TRUE, use bam (generalized additive models for very large datasets) for acceleration. - `edf_flexible`: A logic variable. If TRUE, the degree of freedom for each gene's regression model will be automatically selected for acceleration. - `corr_formula`: A string of the correlation structure. For example, if you want to obtain a correlation structure for each cell type, then this parameter should be the column name of the cell type variable in the colData of sce. @@ -142,6 +146,9 @@ For all detailed tutorials, please check the [website](https://songdongyuan1994. Any questions or suggestions on `scDesign3` are welcomed! Please report it on [issues](https://github.com/SONGDONGYUAN1994/scDesign3/issues), or contact Dongyuan Song ([dongyuansong\@ucla.edu](mailto:dongyuansong@ucla.edu){.email}) or Qingyang Wang ([qw802\@g.ucla.edu](mailto:qw802@g.ucla.edu){.email}). ## Changelog +- 2024-11-07 + - Add options for correlation estimation. The alternative is from R package [coop](https://cran.rstudio.com/web/packages/coop/index.html), which requires BLAS + - 2024-06-22 Important changes - Add sparse Gaussian copula for `fit_copula` - Add automatic k selection for `fit_marginal` diff --git a/man/fit_copula.Rd b/man/fit_copula.Rd index af0b356..8b6fd28 100644 --- a/man/fit_copula.Rd +++ b/man/fit_copula.Rd @@ -18,6 +18,7 @@ fit_copula( family_set = c("gaussian", "indep"), important_feature = "all", if_sparse = FALSE, + correlation_function = "default", n_cores, parallelization = "mcmapply", BPPARAM = NULL @@ -58,6 +59,8 @@ The default value for is "all" (a special string which means no filtering).} \item{if_sparse}{A logic variable. Only works for Gaussian copula (\code{family_set = "gaussian"}). If TRUE, a thresholding strategy will make the corr matrix sparse.} +\item{correlation_function}{A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.} + \item{n_cores}{An integer. The number of cores to use.} \item{parallelization}{A string indicating the specific parallelization function to use. diff --git a/man/fit_marginal.Rd b/man/fit_marginal.Rd index 5728a47..29b2eb0 100644 --- a/man/fit_marginal.Rd +++ b/man/fit_marginal.Rd @@ -53,7 +53,7 @@ will be returned, FALSE otherwise. Default is FALSE.} \item{simplify}{A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}. Default is FALSE.} -\item{filter_cell}{A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.} +\item{filter_cells}{A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.} } \value{ A list of fitted regression models. The length is equal to the total feature number. diff --git a/man/scdesign3.Rd b/man/scdesign3.Rd index c57a42c..cbbca02 100644 --- a/man/scdesign3.Rd +++ b/man/scdesign3.Rd @@ -16,6 +16,7 @@ scdesign3( sigma_formula = "1", family_use = "nb", n_cores = 2, + correlation_function = "default", usebam = FALSE, edf_flexible = FALSE, corr_formula, @@ -61,6 +62,8 @@ Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.} \item{n_cores}{An integer. The number of cores to use.} +\item{correlation_function}{A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.} + \item{usebam}{A logic variable. If use \code{\link[mgcv]{bam}} for acceleration in marginal fitting.} \item{edf_flexible}{A logic variable. It is used for accelerating for spatial model if k is large in 'mu_formula'. Default is FALSE.}