From ec86494c24ba34b2be2e917499dc9f446cb889a4 Mon Sep 17 00:00:00 2001
From: Dongyuan Song <dongyuansong@ucla.edu>
Date: Thu, 7 Nov 2024 19:03:35 -0800
Subject: [PATCH] Fix bugs in alternative choices of correaltion estimation

---
 DESCRIPTION         |  4 ++--
 NEWS.md             |  3 +++
 R/fit_copula.R      | 10 ++++++----
 R/fit_marginal.R    |  2 +-
 R/scdesign3.R       |  3 +++
 README.md           |  9 ++++++++-
 man/fit_copula.Rd   |  3 +++
 man/fit_marginal.Rd |  2 +-
 man/scdesign3.Rd    |  3 +++
 9 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2e33c11..506aa7e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scDesign3
 Type: Package
 Title: A unified framework of realistic in silico data generation and statistical model inference for single-cell and spatial omics
-Version: 1.1.3
+Version: 1.1.4
 Authors@R: 
     c(person("Dongyuan", "Song", , "dongyuansong@ucla.edu", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-1114-1215")),
@@ -55,6 +55,6 @@ biocViews:
     Spatial
 URL: https://github.com/SONGDONGYUAN1994/scDesign3
 BugReports: https://github.com/SONGDONGYUAN1994/scDesign3/issues
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Config/testthat/edition: 3
 VignetteBuilder: knitr
diff --git a/NEWS.md b/NEWS.md
index 89bf24a..dca0f6f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,6 @@
+# 1.1.4 (2024-11-07)
+* Add options for correlation matrix calculation
+
 # 1.1.3 (2024-06-22)
 * Add sparse Gaussian copula for `fit_copula`
 * Add automatic k selection for `fit_marginal`
diff --git a/R/fit_copula.R b/R/fit_copula.R
index b67025d..4e62aa5 100644
--- a/R/fit_copula.R
+++ b/R/fit_copula.R
@@ -27,7 +27,7 @@
 #' The default value for is "all" (a special string which means no filtering).
 #' @param if_sparse A logic variable. Only works for Gaussian copula (\code{family_set = "gaussian"}). If TRUE, a thresholding strategy will make the corr matrix sparse. 
 #' @param n_cores An integer. The number of cores to use.
-#' @param correlation_function A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which call BLAS.
+#' @param correlation_function A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.
 #' @param parallelization A string indicating the specific parallelization function to use.
 #' Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 #' \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.
@@ -257,6 +257,7 @@ fit_copula <- function(sce,
             curr_mat,
             important_feature = important_feature,
             if_sparse = if_sparse,
+            correlation_function = correlation_function,
             lambda = 0.05,
             tol = 1e-8,
             ind = ind
@@ -367,6 +368,7 @@ fit_copula <- function(sce,
 ## Calculate the correlation matrix. If use sparse cor estimation, package spcov will be used (it can be VERY SLOW).
 cal_cor <- function(norm.mat,
                     important_feature,
+                    correlation_function = "default",
                     if_sparse = FALSE,
                     lambda = 0.05,
                     tol = 1e-8,
@@ -386,7 +388,7 @@ cal_cor <- function(norm.mat,
                                operator = 'hard', 
                                corr = TRUE)
     } else {
-      important_cor.mat <- correlation(important.mat, correlation_function = correlation_fucntion)
+      important_cor.mat <- correlation(important.mat, correlation_function = correlation_function)
     }
 
     #s_d <- apply(norm.mat, 2, stats::sd)
@@ -910,7 +912,7 @@ cal_bic <- function(norm.mat,
 }
 
 ## Similar to the cora function from "Rfast" but uses different functions to calculate column means and row sums.
-correlation <- function(x, correlation_function) {
+correlation <- function(x, correlation_function = "default") {
   if(correlation_function == "default") {
     mat <- t(x) - matrixStats::colMeans2(x)
     mat <- mat / sqrt(matrixStats::rowSums2(mat^2))
@@ -925,7 +927,7 @@ correlation <- function(x, correlation_function) {
 }
 
 ## Similar to the cova function from "Rfast" but uses different functions to calculate column means and row sums.
-covariance <- function(x) {
+covariance <- function(x, correlation_function = "default") {
   if(correlation_function == "default") {
   n <- dim(x)[1]
   m <- sqrt(n) * matrixStats::colMeans2(x)
diff --git a/R/fit_marginal.R b/R/fit_marginal.R
index b590efa..0fdfc52 100644
--- a/R/fit_marginal.R
+++ b/R/fit_marginal.R
@@ -25,7 +25,7 @@
 #' @param trace A logic variable. If TRUE, the warning/error log and runtime for gam/gamlss will be returned.
 #' will be returned, FALSE otherwise. Default is FALSE.
 #' @param simplify A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}. Default is FALSE.
-#' @param filter_cell A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.
+#' @param filter_cells A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.
 #' @return A list of fitted regression models. The length is equal to the total feature number.
 #' @examples
 #'   data(example_sce)
diff --git a/R/scdesign3.R b/R/scdesign3.R
index 8ef174f..cc99778 100644
--- a/R/scdesign3.R
+++ b/R/scdesign3.R
@@ -15,6 +15,7 @@
 #' @param family_use A string of the marginal distribution.
 #' Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.
 #' @param n_cores An integer. The number of cores to use.
+#' @param correlation_function A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.
 #' @param usebam A logic variable. If use \code{\link[mgcv]{bam}} for acceleration in marginal fitting.
 #' @param edf_flexible A logic variable. It is used for accelerating for spatial model if k is large in 'mu_formula'. Default is FALSE.
 #' @param corr_formula A string of the correlation structure.
@@ -89,6 +90,7 @@ scdesign3 <- function(sce,
                       sigma_formula = "1",
                       family_use = "nb",
                       n_cores = 2,
+                      correlation_function = "default",
                       usebam = FALSE,
                       edf_flexible = FALSE,
                       corr_formula,
@@ -167,6 +169,7 @@ scdesign3 <- function(sce,
       copula = copula,
       family_set = family_set,
       n_cores = n_cores,
+      correlation_function = correlation_function,
       important_feature = important_feature,
       if_sparse = if_sparse,
       parallelization = parallelization,
diff --git a/README.md b/README.md
index b54ec86..96beb55 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,9 @@ To find out more details about **scDesign3**, you can check out our manuscript o
 
 [Song, D., Wang, Q., Yan, G. *et al.* scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics. *Nat Biotechnol* **42**, 247–252 (2024).](https://www.nature.com/articles/s41587-023-01772-1)
 
-Please note that the parallel computing of scDesign3 is mainly designed for UNIX OS; be careful when you set `n_cores`.
+The computational time is quadratic to the number of features used in copula modeling. Reducing this number will greatly speed up the calculation.
+
+Please note that the parallel computing of scDesign3 is mainly designed for **UNIX OS**; be careful when you set `n_cores`. Please note that you should consider **the balance** between `n_cores` and your ROM (memory). Simply increasing the number of cores without the increase of memory will slow down or froze your program. We recommend that you should allocate at least 1 GB for 1 core.
 
 # Table of contents
 1. [Installation](#installation-)
@@ -49,6 +51,7 @@ example_simu <- scdesign3(
     sigma_formula = "s(pseudotime, k = 5, bs = 'cr')",
     family_use = "nb",
     n_cores = 2,
+    correlation_function = "default",
     usebam = FALSE,
     corr_formula = "1",
     copula = "gaussian",
@@ -78,6 +81,7 @@ The parameters of `scdesign3()` are:
 - `sigma_formula`: A string of the sigma parameter formula for fitting each gene's marginal distribution.
 - `family_use`: A string of the marginal distribution you want to use when fitting each gene's marginal distribution. Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.
 - `n_cores`: An integer. The number of cores to use.
+- `correlation_function`: A string. If 'default', the function from Rfast; if 'coop', the function from coop, which calls BLAS.
 - `usebam`: A logic variable. If TRUE, use bam (generalized additive models for very large datasets) for acceleration.
 - `edf_flexible`: A logic variable. If TRUE, the degree of freedom for each gene's regression model will be automatically selected for acceleration.
 - `corr_formula`: A string of the correlation structure. For example, if you want to obtain a correlation structure for each cell type, then this parameter should be the column name of the cell type variable in the colData of sce.
@@ -142,6 +146,9 @@ For all detailed tutorials, please check the [website](https://songdongyuan1994.
 Any questions or suggestions on `scDesign3` are welcomed! Please report it on [issues](https://github.com/SONGDONGYUAN1994/scDesign3/issues), or contact Dongyuan Song ([dongyuansong\@ucla.edu](mailto:dongyuansong@ucla.edu){.email}) or Qingyang Wang ([qw802\@g.ucla.edu](mailto:qw802@g.ucla.edu){.email}).
 
 ## Changelog
+-   2024-11-07
+    - Add options for correlation estimation. The alternative is from R package [coop](https://cran.rstudio.com/web/packages/coop/index.html), which requires BLAS
+    
 -   2024-06-22 Important changes
     - Add sparse Gaussian copula for `fit_copula`
     - Add automatic k selection for `fit_marginal`
diff --git a/man/fit_copula.Rd b/man/fit_copula.Rd
index af0b356..8b6fd28 100644
--- a/man/fit_copula.Rd
+++ b/man/fit_copula.Rd
@@ -18,6 +18,7 @@ fit_copula(
   family_set = c("gaussian", "indep"),
   important_feature = "all",
   if_sparse = FALSE,
+  correlation_function = "default",
   n_cores,
   parallelization = "mcmapply",
   BPPARAM = NULL
@@ -58,6 +59,8 @@ The default value for is "all" (a special string which means no filtering).}
 
 \item{if_sparse}{A logic variable. Only works for Gaussian copula (\code{family_set = "gaussian"}). If TRUE, a thresholding strategy will make the corr matrix sparse.}
 
+\item{correlation_function}{A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.}
+
 \item{n_cores}{An integer. The number of cores to use.}
 
 \item{parallelization}{A string indicating the specific parallelization function to use.
diff --git a/man/fit_marginal.Rd b/man/fit_marginal.Rd
index 5728a47..29b2eb0 100644
--- a/man/fit_marginal.Rd
+++ b/man/fit_marginal.Rd
@@ -53,7 +53,7 @@ will be returned, FALSE otherwise. Default is FALSE.}
 
 \item{simplify}{A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}. Default is FALSE.}
 
-\item{filter_cell}{A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.}
+\item{filter_cells}{A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.}
 }
 \value{
 A list of fitted regression models. The length is equal to the total feature number.
diff --git a/man/scdesign3.Rd b/man/scdesign3.Rd
index c57a42c..cbbca02 100644
--- a/man/scdesign3.Rd
+++ b/man/scdesign3.Rd
@@ -16,6 +16,7 @@ scdesign3(
   sigma_formula = "1",
   family_use = "nb",
   n_cores = 2,
+  correlation_function = "default",
   usebam = FALSE,
   edf_flexible = FALSE,
   corr_formula,
@@ -61,6 +62,8 @@ Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.}
 
 \item{n_cores}{An integer. The number of cores to use.}
 
+\item{correlation_function}{A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.}
+
 \item{usebam}{A logic variable. If use \code{\link[mgcv]{bam}} for acceleration in marginal fitting.}
 
 \item{edf_flexible}{A logic variable. It is used for accelerating for spatial model if k is large in 'mu_formula'. Default is FALSE.}