Commit d8f88f8

Add files via upload
1 parent 1b00f56 commit d8f88f8

76 files changed: +14427 -1579 lines changed

R/RcppExports.R

Lines changed: 79 additions & 160 deletions
@@ -1,160 +1,79 @@
-# Generated by using Rcpp::compileAttributes() -> do not edit by hand
-# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
-
-Rcpp_predict_sparse <- function(A, mask, w, L1, L2, threads, mask_zeros) {
-    .Call(`_RcppML_Rcpp_predict_sparse`, A, mask, w, L1, L2, threads, mask_zeros)
-}
-
-Rcpp_predict_dense <- function(A_, mask, w, L1, L2, threads, mask_zeros) {
-    .Call(`_RcppML_Rcpp_predict_dense`, A_, mask, w, L1, L2, threads, mask_zeros)
-}
-
-Rcpp_mse_sparse <- function(A, mask, w, d, h, threads, mask_zeros) {
-    .Call(`_RcppML_Rcpp_mse_sparse`, A, mask, w, d, h, threads, mask_zeros)
-}
-
-Rcpp_mse_dense <- function(A_, mask, w, d, h, threads, mask_zeros) {
-    .Call(`_RcppML_Rcpp_mse_dense`, A_, mask, w, d, h, threads, mask_zeros)
-}
-
-Rcpp_mse_missing_sparse <- function(A, mask, w, d, h, threads) {
-    .Call(`_RcppML_Rcpp_mse_missing_sparse`, A, mask, w, d, h, threads)
-}
-
-Rcpp_mse_missing_dense <- function(A_, mask, w, d, h, threads) {
-    .Call(`_RcppML_Rcpp_mse_missing_dense`, A_, mask, w, d, h, threads)
-}
-
-Rcpp_nmf_sparse <- function(A, mask, tol, maxit, verbose, L1, L2, threads, w_init, link_matrix_h, mask_zeros, link_h, sort_model) {
-    .Call(`_RcppML_Rcpp_nmf_sparse`, A, mask, tol, maxit, verbose, L1, L2, threads, w_init, link_matrix_h, mask_zeros, link_h, sort_model)
-}
-
-Rcpp_nmf_dense <- function(A_, mask, tol, maxit, verbose, L1, L2, threads, w_init, link_matrix_h, mask_zeros, link_h, sort_model) {
-    .Call(`_RcppML_Rcpp_nmf_dense`, A_, mask, tol, maxit, verbose, L1, L2, threads, w_init, link_matrix_h, mask_zeros, link_h, sort_model)
-}
-
-Rcpp_bipartition_sparse <- function(A, tol, maxit, nonneg, samples, seed, verbose = FALSE, calc_dist = FALSE, diag = TRUE) {
-    .Call(`_RcppML_Rcpp_bipartition_sparse`, A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag)
-}
-
-Rcpp_bipartition_dense <- function(A, tol, maxit, nonneg, samples, seed, verbose = FALSE, calc_dist = FALSE, diag = TRUE) {
-    .Call(`_RcppML_Rcpp_bipartition_dense`, A, tol, maxit, nonneg, samples, seed, verbose, calc_dist, diag)
-}
-
-Rcpp_dclust_sparse <- function(A, min_samples, min_dist, verbose, tol, maxit, nonneg, seed, threads) {
-    .Call(`_RcppML_Rcpp_dclust_sparse`, A, min_samples, min_dist, verbose, tol, maxit, nonneg, seed, threads)
-}
-
-#' @title Non-negative least squares
-#'
-#' @description Solves the equation \code{a %*% x = b} for \code{x} subject to \eqn{x \ge 0}.
-#'
-#' @details
-#' This is a very fast implementation of non-negative least squares (NNLS), suitable for very small or very large systems.
-#'
-#' **Algorithm**. Sequential coordinate descent (CD) is at the core of this implementation, and requires an initialization of \eqn{x}. There are two supported methods for initialization of \eqn{x}:
-#' 1. **Zero-filled initialization** when \code{fast_nnls = FALSE} and \code{cd_maxit > 0}. This is generally very efficient for well-conditioned and small systems.
-#' 2. **Approximation with FAST** when \code{fast_nnls = TRUE}. Forward active set tuning (FAST), described below, finds an approximate active set using unconstrained least squares solutions found by Cholesky decomposition and substitution. To use only the FAST approximation, set \code{cd_maxit = 0}.
-#'
-#' \code{a} must be symmetric positive definite if FAST NNLS is used, but this is not checked.
-#'
-#' See our bioRxiv manuscript (references) for benchmarking against Lawson-Hanson NNLS and for a more technical introduction to these methods.
-#'
-#' **Coordinate Descent NNLS**. Least squares by **sequential coordinate descent** is used to ensure the solution returned is exact. This algorithm was
-#' introduced by Franc et al. (2005), and our implementation is a vectorized and optimized rendition of the one found in the NNLM R package by Xihui Lin (2020).
-#'
-#' **FAST NNLS**. Forward active set tuning (FAST) is an exact or near-exact NNLS approximation initialized by an unconstrained
-#' least squares solution. Negative values in this unconstrained solution are set to zero (the "active set"), and all
-#' other values are added to a "feasible set". An unconstrained least squares solution is then solved for the
-#' "feasible set", any negative values in the resulting solution are set to zero, and the process is repeated until
-#' the feasible set solution is strictly positive.
-#'
-#' The FAST algorithm is guaranteed to converge because the
-#' feasible set will either converge or become smaller with each iteration. The result is generally exact or nearly
-#' exact for small well-conditioned systems (< 50 variables) within 2 iterations, and thus sets up coordinate
-#' descent for very rapid convergence. The FAST method is similar to the first phase of the so-called "TNT-NN" algorithm (Myre et al., 2017),
-#' but the latter half of that method relies heavily on heuristics to refine the approximate active set, which we avoid by using
-#' coordinate descent instead.
-#'
-#' @param a symmetric positive definite matrix giving the coefficients of the linear system
-#' @param b matrix giving the right-hand side(s) of the linear system
-#' @param L1 L1/LASSO penalty to be subtracted from \code{b}
-#' @param L2 Ridge penalty to be added to the diagonal of \code{a}
-#' @param PE Pattern Extraction (angular) penalty to be added to the off-diagonal values of \code{a}
-#' @param fast_nnls initialize coordinate descent with a FAST NNLS approximation
-#' @param cd_maxit maximum number of coordinate descent iterations
-#' @param cd_tol stopping criterion: the difference in \eqn{x} across consecutive solutions over the sum of \eqn{x}
-#' @return vector or matrix giving the solution for \code{x}
-#' @export
-#' @author Zach DeBruine
-#' @seealso \code{\link{nmf}}, \code{\link{project}}
-#' @md
-#'
-#' @references
-#'
-#' DeBruine, ZJ, Melcher, K, and Triche, TJ (2021). "High-performance non-negative matrix factorization for large single-cell data." bioRxiv.
-#'
-#' Franc, VC, Hlavac, VC, and Navara, M (2005). "Sequential Coordinate-Wise Algorithm for the Non-negative Least Squares Problem." Proc. Int'l Conf. Computer Analysis of Images and Patterns.
-#'
-#' Lin, X, and Boutros, PC (2020). "Optimization and expansion of non-negative matrix factorization." BMC Bioinformatics.
-#'
-#' Myre, JM, Frahm, E, Lilja, DJ, and Saar, MO (2017). "TNT-NN: A Fast Active Set Method for Solving Large Non-Negative Least Squares Problems." Procedia Computer Science.
-#'
-#' @examples
-#' \dontrun{
-#' # compare solution to base::solve for a random system
-#' X <- matrix(runif(100), 10, 10)
-#' a <- crossprod(X)
-#' b <- crossprod(X, runif(10))
-#' unconstrained_soln <- solve(a, b)
-#' nonneg_soln <- nnls(a, b)
-#' unconstrained_err <- mean((a %*% unconstrained_soln - b)^2)
-#' nonnegative_err <- mean((a %*% nonneg_soln - b)^2)
-#' unconstrained_err
-#' nonnegative_err
-#' all.equal(solve(a, b), nnls(a, b))
-#'
-#' # example adapted from multiway::fnnls example 1
-#' X <- matrix(1:100, 50, 2)
-#' y <- matrix(101:150, 50, 1)
-#' beta <- solve(crossprod(X)) %*% crossprod(X, y)
-#' beta
-#' beta <- nnls(crossprod(X), crossprod(X, y))
-#' beta
-#' }
-nnls <- function(a, b, cd_maxit = 100L, cd_tol = 1e-8, fast_nnls = FALSE, L1 = 0, L2 = 0, PE = 0) {
-    .Call(`_RcppML_nnls`, a, b, cd_maxit, cd_tol, fast_nnls, L1, L2, PE)
-}
-
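The removed documentation above walks through both NNLS phases in prose. A minimal R sketch of the two, for orientation only: the shipped routine is the C++ backend registered as `_RcppML_nnls`, and the helper names, tolerance handling, and loop structure below are illustrative simplifications, not the package implementation.

# FAST: solve the unconstrained system on a shrinking feasible set,
# zeroing negative coefficients, until the feasible-set solution is positive
fast_init <- function(a, b) {
  x <- rep(0, length(b))
  feasible <- seq_along(b)
  repeat {
    xf <- solve(a[feasible, feasible, drop = FALSE], b[feasible])
    if (all(xf > 0)) break
    feasible <- feasible[xf > 0]
    if (length(feasible) == 0) return(x)
  }
  x[feasible] <- xf
  x
}

# sequential coordinate descent: update one coefficient at a time,
# clamping at zero, until consecutive solutions stop changing
cd_nnls <- function(a, b, x = rep(0, length(b)), cd_maxit = 100, cd_tol = 1e-8) {
  for (iter in seq_len(cd_maxit)) {
    x_prev <- x
    for (i in seq_along(x)) {
      x[i] <- max(0, x[i] + (b[i] - sum(a[i, ] * x)) / a[i, i])
    }
    if (sum(abs(x - x_prev)) / (sum(x) + 1e-15) < cd_tol) break
  }
  x
}

# usage: FAST approximation followed by coordinate descent refinement
X <- matrix(runif(100), 10, 10)
a <- crossprod(X)
b <- as.vector(crossprod(X, runif(10)))
x <- cd_nnls(a, b, x = fast_init(a, b))
min(x)  # all coefficients are non-negative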
-c_rmatrix <- function(nrow, ncol, rng) {
-    .Call(`_RcppML_c_rmatrix`, nrow, ncol, rng)
-}
-
-c_rtimatrix <- function(nrow, ncol, rng) {
-    .Call(`_RcppML_c_rtimatrix`, nrow, ncol, rng)
-}
-
-c_runif <- function(n, min, max, rng, rng2) {
-    .Call(`_RcppML_c_runif`, n, min, max, rng, rng2)
-}
-
-c_rbinom <- function(n, size, inv_probability, rng, rng2) {
-    .Call(`_RcppML_c_rbinom`, n, size, inv_probability, rng, rng2)
-}
-
-c_sample <- function(n, size, replace, rng, rng2) {
-    .Call(`_RcppML_c_sample`, n, size, replace, rng, rng2)
-}
-
-c_rtisparsematrix <- function(nrow, ncol, inv_probability, pattern_only, rng) {
-    .Call(`_RcppML_c_rtisparsematrix`, nrow, ncol, inv_probability, pattern_only, rng)
-}
-
-c_rsparsematrix <- function(nrow, ncol, inv_probability, pattern_only, rng) {
-    .Call(`_RcppML_c_rsparsematrix`, nrow, ncol, inv_probability, pattern_only, rng)
-}
-
-Rcpp_bipartite_match <- function(x) {
-    .Call(`_RcppML_Rcpp_bipartite_match`, x)
-}
-
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#' Benchmark structure performance
+#'
+#' Measure runtime for computing column sums and sparse-dense matrix multiplication
+#'
+#' @param x sparse matrix object of class \code{dgCMatrix} of dimensions \code{m x n} with integral non-zero values
+#' @param y dense matrix object of class \code{matrix} of dimensions \code{k x n}
+#' @param n_reps number of replicate timings to run
+#' @return creates \code{bench_colsums} and \code{bench_tcrossprod} in the global environment. Use the \code{RcppClock} plot method to visualize results, or coerce them to a data.frame.
+#' @export
+#' @seealso \code{\link{tcrossprod}}
+#' @examples
+#' library(Matrix)
+#' x <- rsparsematrix(50, 100, density = 0.5,
+#'   rand.x = function(n) { sample(1:10, n, replace = TRUE) })
+#' y <- matrix(runif(10 * 100), 10, 100)
+#' VSE::benchmark(x, y)
+#' plot(bench_tcrossprod)
+#' plot(bench_colsums)
+benchmark <- function(x, y, n_reps = 100L) {
+    invisible(.Call(`_VSE_benchmark`, x, y, n_reps))
+}
+
+#' Compute column sums of sparse-encoded data
+#'
+#' @param A object of class \code{dgCMatrix}
+#' @param encoding sparse encoding to be used
+#' @export
+#' @return vector of column sums
+#' @examples
+#' library(Matrix)
+#' A <- rsparsematrix(100, 1000, density = 0.5,
+#'   rand.x = function(n) { sample(1:10, n, replace = TRUE) })
+#' v <- colsums(A, encoding = "TRCSC_NP")
+#' plot(v, Matrix::colSums(A))
+colsums <- function(A, encoding = "CSC") {
+    .Call(`_VSE_colsums`, A, encoding)
+}
+
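For orientation, the column-sum kernel that colsums() runs in C++ can be written against the standard dgCMatrix CSC slots in a few lines of plain R. This is a reference sketch of the CSC case only; the alternative encodings such as "TRCSC_NP" are handled by the C++ backend and are not modeled here.

library(Matrix)
# column j of a CSC matrix owns the value run A@x[(A@p[j] + 1):(A@p[j + 1])]
csc_colsums <- function(A) {
  sums <- numeric(ncol(A))
  for (j in seq_len(ncol(A))) {
    idx <- seq.int(A@p[j] + 1, length.out = A@p[j + 1] - A@p[j])
    sums[j] <- sum(A@x[idx])
  }
  sums
}

A <- rsparsematrix(100, 1000, density = 0.5,
                   rand.x = function(n) sample(1:10, n, replace = TRUE))
all.equal(csc_colsums(A), unname(Matrix::colSums(A)))  # TRUE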
+c_inspect <- function(A, encoding = "CSC") {
+    .Call(`_VSE_c_inspect`, A, encoding)
+}
+
+#' Get memory used by C++ object
+#'
+#' Calculates the size of the vector container, all sub-containers, and all index and value types within the container. For this calculation, all values and indices are stored as \code{short int}.
+#'
+#' @param A object of class \code{dgCMatrix}
+#' @param encoding sparse encoding to be used
+#' @export
+#' @return size of the object in bytes
+#' @examples
+#' library(Matrix)
+#' A <- rsparsematrix(100, 1000, density = 0.5,
+#'   rand.x = function(n) { sample(1:10, n, replace = TRUE) })
+#' memuse(A, "TRCSC_NP")
+memuse <- function(A, encoding = "CSC") {
+    .Call(`_VSE_memuse`, A, encoding)
+}
+
+#' Cross-product of transpose
+#'
+#' @param x sparse matrix object of class \code{dgCMatrix} of dimensions \code{m x n} with integral non-zero values
+#' @param y dense matrix object of class \code{matrix} of dimensions \code{k x n}
+#' @param encoding sparse encoding to be used
+#' @export
+#' @return dense matrix giving the transpose cross-product
+#' @examples
+#' library(Matrix)
+#' x <- rsparsematrix(100, 1000, density = 0.5,
+#'   rand.x = function(n) { sample(1:10, n, replace = TRUE) })
+#' y <- matrix(runif(10 * 1000), 10, 1000)
+#' plot(VSE::tcrossprod(x, y), t(Matrix::tcrossprod(x, y)))
+tcrossprod <- function(x, y, encoding = "CSC") {
+    .Call(`_VSE_tcrossprod`, x, y, encoding)
+}
+
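Judging from the example above, VSE::tcrossprod(x, y) agrees with t(Matrix::tcrossprod(x, y)), i.e. y %*% t(x). A plain-R reference over the CSC slots of x follows; the actual C++ loop ordering over the chosen encoding is an assumption on my part.

library(Matrix)
csc_tcrossprod <- function(x, y) {
  res <- matrix(0, nrow(y), nrow(x))
  for (j in seq_len(ncol(x))) {
    # nonzeros of column j of x sit at rows x@i[k] + 1 with values x@x[k]
    for (k in seq.int(x@p[j] + 1, length.out = x@p[j + 1] - x@p[j])) {
      res[, x@i[k] + 1] <- res[, x@i[k] + 1] + x@x[k] * y[, j]
    }
  }
  res
}

x <- rsparsematrix(100, 1000, density = 0.5,
                   rand.x = function(n) sample(1:10, n, replace = TRUE))
y <- matrix(runif(10 * 1000), 10, 1000)
all.equal(csc_tcrossprod(x, y), y %*% t(as.matrix(x)))  # TRUE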

R/VSE.R

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+#' VSE: Vectorized Sparse Encoding
+#'
+#' @description
+#' Proof-of-concept implementation of vectorized sparse
+#' encodings: scalable, compressed structures for sparse data
+#'
+#' @import knitr Matrix RcppEigen RcppClock
+#' @importFrom Rcpp evalCpp
+#' @useDynLib VSE, .registration = TRUE
+#' @docType package
+#' @name VSE
+#' @author Zach DeBruine
+#' @aliases VSE-package
+#' @md
+#'
+NULL

R/inspect.R

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+#' View internal structure of sparse encodings
+#'
+#' @param A object of class \code{dgCMatrix}
+#' @param encoding sparse encoding to be used
+#' @importFrom utils str
+#' @export
+#' @return debug-level output printed to the console
+#' @examples
+#' library(Matrix)
+#' A <- rsparsematrix(20, 5, density = 0.2,
+#'   rand.x = function(n) { sample(1:5, n, replace = TRUE) })
+#' inspect(A, "TRCSC_NP")
+inspect <- function(A, encoding = "TRCSC_NP") {
+    str(c_inspect(A, encoding)[-1])
+}

R/redundancy.R

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+#' Calculate redundancy of data per column
+#'
+#' @param A sparse matrix of class \code{dgCMatrix}
+#' @export
+#' @return fractional redundancy by column
+#'
+calc_redundancy <- function(A) {
+    redundancy <- c()
+    for (i in 1:ncol(A)) {
+        redundancy <- c(redundancy, length(unique(A[, i])) / (A@p[[i + 1]] - A@p[[i]]))
+    }
+    1 - redundancy
+}
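A quick usage sketch for calc_redundancy; the seed and matrix parameters are mine, for a reproducible illustration. Each column scores 1 minus the number of unique values seen in that column over its nonzero count, so heavy value repetition pushes the score toward 1. Note that unique() sees the full column, zeros included.

library(Matrix)
set.seed(1)
A <- rsparsematrix(100, 20, density = 0.5,
                   rand.x = function(n) sample(1:3, n, replace = TRUE))
# ~50 nonzeros per column drawn from {1, 2, 3}, plus zeros: about 4 unique
# values per column, so scores land near 1 - 4/50
round(calc_redundancy(A), 2)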
