Merge pull request campbio#52 from joshua-d-campbell/devel

Incorporating updates to recursiveSplit
mingl1997 · Mar 19, 2019 · 6597dd0 · 6597dd0
2 parents bb0262e + f818100
commit 6597dd0
Show file tree

Hide file tree

Showing 18 changed files with 400 additions and 146 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -88,6 +88,7 @@ import(scales)
 useDynLib(celda,"_colSumByGroup")
 useDynLib(celda,"_colSumByGroupChange")
 useDynLib(celda,"_colSumByGroup_numeric")
+useDynLib(celda,"_perplexityG")
 useDynLib(celda,"_rowSumByGroup")
 useDynLib(celda,"_rowSumByGroupChange")
 useDynLib(celda,"_rowSumByGroup_numeric")
diff --git a/R/celdaGridSearch.R b/R/celdaGridSearch.R
@@ -185,7 +185,7 @@ selectBestModel = function(celda.list) {
   if (!methods::is(celda.list, "celdaList")) stop("celda.list parameter was not of class celdaList.")
 
   log_likelihood = NULL
-  group = setdiff(colnames(celda.list@run.params), c("index", "chain", "log_likelihood"))
+  group = setdiff(colnames(celda.list@run.params), c("index", "chain", "log_likelihood", "mean_perplexity"))
   dt = data.table::as.data.table(celda.list@run.params)
   new.run.params = as.data.frame(dt[,.SD[which.max(log_likelihood)], by=group])
   new.run.params = new.run.params[,colnames(celda.list@run.params)]

diff --git a/R/celda_C.R b/R/celda_C.R
@@ -431,12 +431,18 @@ cC.calcLL = function(m.CP.by.S, n.G.by.CP, s, z, K, nS, nG, alpha, beta) {
 #' @return Numeric. The log likelihood for the given cluster assignments
 #' @seealso `celda_C()` for clustering cells
 #' @examples
-#' loglik = logLikelihood(celda.C.sim$counts, model="celda_C", 
+#' loglik = logLikelihood.celda_C(celda.C.sim$counts, 
 #'                        sample.label=celda.C.sim$sample.label,
 #'                        z=celda.C.sim$z, K=celda.C.sim$K,
 #'                        alpha=celda.C.sim$alpha, beta=celda.C.sim$beta)
+#'                        
+#' loglik = logLikelihood(celda.C.sim$counts, model="celda_C",
+#'                        sample.label=celda.C.sim$sample.label,
+#'                        z=celda.C.sim$z, K=celda.C.sim$K,
+#'                        alpha=celda.C.sim$alpha, beta=celda.C.sim$beta)
+#'                        
 #' @export
-logLikelihood.celda_C = function(counts, model, sample.label, z, K, 
+logLikelihood.celda_C = function(counts, sample.label, z, K, 
                                  alpha, beta) {
   if (sum(z > K) > 0) stop("An entry in z contains a value greater than the provided K.")
   sample.label = processSampleLabels(sample.label, ncol(counts))
@@ -533,7 +539,7 @@ setMethod("clusterProbability",
 setMethod("perplexity",
           signature(celda.mod = "celda_C"),
           function(counts, celda.mod, new.counts=NULL) {
-            compareCountMatrix(counts, celda.mod)
+
             if (!("celda_C" %in% class(celda.mod))) { 
               stop("The celda.mod provided was not of class celda_C.")
             }

diff --git a/R/celda_CG.R b/R/celda_CG.R
@@ -526,12 +526,20 @@ cCG.calcLL = function(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS
 #' @return The log likelihood for the given cluster assignments
 #' @seealso `celda_CG()` for clustering features and cells
 #' @examples
-#' loglik = logLikelihood(celda.CG.sim$counts, model="celda_CG", 
+#' loglik = logLikelihood.celda_CG(celda.CG.sim$counts, 
 #'                        sample.label=celda.CG.sim$sample.label,
 #'                        z=celda.CG.sim$z, y=celda.CG.sim$y,
 #'                        K=celda.CG.sim$K, L=celda.CG.sim$L,
 #'                        alpha=celda.CG.sim$alpha, beta=celda.CG.sim$beta,
 #'                        gamma=celda.CG.sim$gamma, delta=celda.CG.sim$delta)
+#'                        
+#' loglik = logLikelihood(celda.CG.sim$counts, model="celda_CG",
+#'                        sample.label=celda.CG.sim$sample.label,
+#'                        z=celda.CG.sim$z, y=celda.CG.sim$y,
+#'                        K=celda.CG.sim$K, L=celda.CG.sim$L,
+#'                        alpha=celda.CG.sim$alpha, beta=celda.CG.sim$beta,
+#'                        gamma=celda.CG.sim$gamma, delta=celda.CG.sim$delta)
+#'                        
 #' @export
 logLikelihood.celda_CG = function(counts, sample.label, z, y, K, L, alpha, beta, delta, gamma) {  
   if (sum(z > K) > 0) stop("An entry in z contains a value greater than the provided K.")

diff --git a/R/celda_G.R b/R/celda_G.R
@@ -419,10 +419,16 @@ cG.calcLL = function(n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delt
 #' @return The log-likelihood for the given cluster assignments
 #' @seealso `celda_G()` for clustering features
 #' @examples
-#' loglik = logLikelihood(celda.G.sim$counts, model="celda_G", 
+#' loglik = logLikelihood.celda_G(celda.G.sim$counts, 
 #'                        y=celda.G.sim$y, L=celda.G.sim$L,
 #'                        beta=celda.G.sim$beta, delta=celda.G.sim$delta,
 #'                        gamma=celda.G.sim$gamma)
+#'                        
+#' loglik = logLikelihood(celda.G.sim$counts, model="celda_G",
+#'                        y=celda.G.sim$y, L=celda.G.sim$L,
+#'                        beta=celda.G.sim$beta, delta=celda.G.sim$delta,
+#'                        gamma=celda.G.sim$gamma)
+#'                        
 #' @export
 logLikelihood.celda_G = function(counts, y, L, beta, delta, gamma) {
   if (sum(y > L) > 0) stop("An entry in y contains a value greater than the provided L.")
@@ -533,15 +539,15 @@ setMethod("perplexity",
 
             factorized = factorizeMatrix(counts = counts, celda.mod = celda.mod, 
                                          type=c("posterior", "counts"))
-            phi <- factorized$posterior$module
-            psi <- factorized$posterior$cell
+            psi <- factorized$posterior$module
+            phi <- factorized$posterior$cell
             eta <- factorized$posterior$gene.distribution
             nG.by.TS = factorized$counts$gene.distribution
 
             eta.prob = log(eta) * nG.by.TS
-            gene.by.cell.prob = log(phi %*% psi) 
-            log.px = sum(gene.by.cell.prob * new.counts) # + sum(eta.prob) 
-
+#            gene.by.cell.prob = log(psi %*% phi) 
+#            log.px = sum(gene.by.cell.prob * new.counts) # + sum(eta.prob) 
+            log.px = perplexityG_logPx(new.counts, phi, psi, celda.mod@clusters$y, celda.mod@params$L)# + sum(eta.prob) 
             perplexity = exp(-(log.px/sum(new.counts)))
             return(perplexity)
           })

diff --git a/R/initialize_clusters.R b/R/initialize_clusters.R
@@ -69,6 +69,8 @@ initialize.splitZ = function(counts, K, K.subcluster=NULL, alpha=1, beta=1, min.
     z.ta = tabulate(overall.z, max(overall.z))
     z.to.split = sample(which(z.ta > min.cell & z.ta > K.to.use))
 
+    if(length(z.to.split) == 0) break()
+
     ## Cycle through each splitable cluster and split it up into K.sublcusters
     for(i in z.to.split) {
 
@@ -138,7 +140,8 @@ initialize.splitZ = function(counts, K, K.subcluster=NULL, alpha=1, beta=1, min.
     m.CP.by.S = p$m.CP.by.S[-z.to.remove,,drop=FALSE]
     overall.z = as.integer(as.factor(overall.z))        
     current.K = current.K - 1
-  }  
+
+  } 
   return(overall.z)
 }
 
@@ -165,7 +168,9 @@ initialize.splitY = function(counts, L, L.subcluster=NULL, temp.K=100, beta=1, d
     ## Determine which clusters are split-able
     y.ta = tabulate(overall.y, max(overall.y))
     y.to.split = sample(which(y.ta > min.feature & y.ta > L.subcluster))
-
+
+    if(length(y.to.split) == 0) break()
+
     ## Cycle through each splitable cluster and split it up into L.sublcusters
     for(i in y.to.split) {
 

diff --git a/R/matrixSums.R b/R/matrixSums.R
@@ -42,3 +42,10 @@ colSumByGroup.numeric <- function(x, group, K) {
   res <- .Call("_colSumByGroup_numeric", x, group)
   return(res)
 }
+
+#' @useDynLib celda _perplexityG
+perplexityG_logPx <- function(x, phi, psi, group, L) {
+  # Wraps the compiled "_perplexityG" routine; `group` is recoded as a factor with levels 1..L.
+  group <- factor(group, levels = seq_len(L))  # seq_len(): 1:L would give c(1, 0) when L is 0
+  .Call("_perplexityG", x, phi, psi, group)    # value of the native call is returned unchanged
+}
diff --git a/R/model_performance.R b/R/model_performance.R
@@ -22,19 +22,20 @@ resamplePerplexity <- function(counts, celda.list, resample=5, seed=12345) {
   if (!isTRUE(is.numeric(resample))) stop("Provided resample parameter was not numeric.")
 
   setSeed(seed)
-  countsList = lapply(1:resample,
-                      function(i){
-                        resampleCountMatrix(counts)
-                      })
-
+
   perp.res = matrix(NA, nrow=length(celda.list@res.list), ncol=resample)
-  for(i in 1:length(celda.list@res.list)) {
-    for(j in 1:resample) {
-      perp.res[i,j] = perplexity(counts, celda.list@res.list[[i]], countsList[[j]])
+  for(j in 1:resample) {
+    new.counts = resampleCountMatrix(counts)
+    for(i in 1:length(celda.list@res.list)) {      
+      perp.res[i,j] = perplexity(counts, celda.list@res.list[[i]], new.counts)
     }
   }
   celda.list@perplexity = perp.res
 
+  ## Add mean perplexity to run.params
+  perp.mean = apply(perp.res, 1, mean)
+  celda.list@run.params$mean_perplexity=perp.mean
+
   return(celda.list)
 }
 
@@ -86,14 +87,25 @@ plotGridSearchPerplexity.celda_CG = function(celda.list) {
   l.means.by.k$K = as.factor(l.means.by.k$K)
   l.means.by.k$L = as.factor(l.means.by.k$L)
 
-  plot = ggplot2::ggplot(df, ggplot2::aes_string(x="K", y="perplexity")) +
-  		ggplot2::geom_jitter(height=0, width=0.1, ggplot2::aes_string(color="L")) +
-        ggplot2::scale_color_discrete(name="L") +
-        ggplot2::geom_path(data=l.means.by.k, 
-                           ggplot2::aes_string(x="K", y="mean_perplexity", group="L", color="L")) +
-        ggplot2::ylab("Perplexity") +
-        ggplot2::xlab("K") +
-        ggplot2::theme_bw()
+  if(nlevels(df$K) > 1) {
+    plot = ggplot2::ggplot(df, ggplot2::aes_string(x="K", y="perplexity")) +
+      ggplot2::geom_jitter(height=0, width=0.1, ggplot2::aes_string(color="L")) +
+      ggplot2::scale_color_discrete(name="L") +
+      ggplot2::geom_path(data=l.means.by.k, 
+                         ggplot2::aes_string(x="K", y="mean_perplexity", group="L", color="L")) +
+      ggplot2::ylab("Perplexity") +
+      ggplot2::xlab("K") +
+      ggplot2::theme_bw()
+  } else {
+    plot = ggplot2::ggplot(df, ggplot2::aes_string(x="L", y="perplexity")) +
+      ggplot2::geom_jitter(height=0, width=0.1, ggplot2::aes_string(color="K")) +
+      ggplot2::scale_color_discrete(name="K") +
+      ggplot2::geom_path(data=l.means.by.k, 
+                         ggplot2::aes_string(x="L", y="mean_perplexity", group="K", color="K")) +
+      ggplot2::ylab("Perplexity") +
+      ggplot2::xlab("L") +
+      ggplot2::theme_bw()
+  }
 
   return(plot)
 }