add names of three new function to NAMESPACE and three Rd files in ma…

…n folder Former-commit-id: c511bbc
mingl1997 · Dec 7, 2017 · 7452d26 · 7452d26
1 parent 68f986e
commit 7452d26
Show file tree

Hide file tree

Showing 100 changed files with 416 additions and 154 deletions.
diff --git a/CONDUCT.md b/CONDUCT.md
diff --git a/LICENSE b/LICENSE
diff --git a/LICENSE.pheatmap b/LICENSE.pheatmap
diff --git a/NAMESPACE b/NAMESPACE
@@ -21,6 +21,7 @@ S3method(simulateCells,celda_G)
 S3method(visualizeModelPerformance,celda_C)
 S3method(visualizeModelPerformance,celda_CG)
 S3method(visualizeModelPerformance,celda_G)
+export(GiniPlot)
 export(available_models)
 export(calculateLoglikFromVariables)
 export(calculatePerplexity)
@@ -37,6 +38,7 @@ export(clusterProbability.celda_G)
 export(compareCountMatrix)
 export(completeClusterHistory)
 export(completeLogLikelihood)
+export(diffExp_MAST)
 export(distinct_colors)
 export(factorizeMatrix)
 export(finalClusterAssignment)
@@ -56,8 +58,10 @@ export(renderInteractiveKLPlot)
 export(renderIterationLikelihoodPlot)
 export(runParams)
 export(seed)
+export(selectModels)
 export(semi_pheatmap)
 export(simulateCells)
+export(stateHeatmap)
 export(topRank)
 export(visualizeModelPerformance)
 import(RColorBrewer)

diff --git a/R/StateHeatmap.R b/R/StateHeatmap.R
diff --git a/R/celda.R b/R/celda.R
@@ -19,7 +19,7 @@ available_models = c("celda_C", "celda_G", "celda_CG")
 #' @return Object of class "celda_list", which contains results for all model parameter combinations and summaries of the run parameters
 #' @import foreach
 #' @export
-celda = function(counts, model, sample.label=NULL, nchains=1, cores=1, seed=12345, verbose=TRUE, logfile_prefix="Celda", ...) {
+celda = function(counts, model, sample.label=NULL, nchains=1, cores=1, seed=12345, verbose=FALSE, logfile_prefix="Celda", ...) {
   message("Starting celda...")
   validateArgs(counts, model, sample.label, nchains, cores, seed, ...)
 

diff --git a/R/celda_C.R b/R/celda_C.R
@@ -204,18 +204,18 @@ celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1,
     iter = iter + 1    
   }
 
-  reordered.labels = reorder.label.by.size(z.best, K)
-  z.final.reorder = reordered.labels$new.labels
   names = list(row=rownames(counts), column=colnames(counts), sample=levels(sample.label))
 
-  result = list(z=z.final.reorder, completeLogLik=ll,  
+  result = list(z=z.best, completeLogLik=ll,  
                 finalLogLik=ll.best, seed=seed, K=K, 
                 sample.label=sample.label, alpha=alpha, 
                 beta=beta, count.checksum=count.checksum, 
                 names=names)
 
   class(result) = "celda_C"
 
+  result = reorder.celdaC(counts = counts, res = result)
+
   return(result)
 }
 
@@ -351,6 +351,16 @@ cC.calcLL = function(m.CP.by.S, n.CP.by.G, s, z, K, nS, alpha, beta) {
   return(final)
 }
 
+reorder.celdaC = function(counts,res){
+  #Reorder K
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
+  fm.norm <- t(normalizeCounts(t(fm$proportions$gene.states),scale.factor = 1))
+  d <- dist(t(fm.norm),diag = TRUE, upper = TRUE)
+  h <- hclust(d, method = "complete")
+  res <- recodeClusterZ(res,from = h$order,
+                        to = c(1:ncol(fm$counts$gene.states)))
+  return(res)
+}
 
 #' Generate factorized matrices showing each feature's influence on the celda_C model clustering 
 #' 

diff --git a/R/celda_CG.R b/R/celda_CG.R
@@ -93,7 +93,7 @@ calculateLoglikFromVariables.celda_CG = function(counts, s, z, y, K, L, alpha, b
   return(final)
 }
 
-cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) {
+.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) {
 
   ## Determine if any TS has 0 genes
   ## Need to remove 0 gene states as this will cause the likelihood to fail
@@ -142,7 +142,7 @@ cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS
   return(final)
 }
 
-cCG.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) {
+.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) {
 
   ## Calculate for "Theta" component
   theta.ll = log(m.CP.by.S + alpha)
@@ -158,7 +158,7 @@ cCG.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) {
   return(final)
 }
 
-cCG.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delta, gamma) {
+.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delta, gamma) {
 
   ## Calculate for "Phi" component
   phi.ll = sum(lgamma(n.CP.by.TS + beta))
@@ -176,6 +176,25 @@ cCG.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delt
   return(final)
 }
 
+reorder.celdaCG = function(counts,res){
+  #Reorder K
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
+  fm.norm <- t(normalizeCounts(t(fm$proportions$population.states),scale.factor = 1))
+  d <- dist(t(fm.norm),diag = TRUE, upper = TRUE)
+  h <- hclust(d, method = "complete")
+  res <- recodeClusterZ(res,from = h$order,
+                        to = c(1:ncol(fm$counts$population.states)))
+
+  #Reorder L
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
+  fm.norm <- t(normalizeCounts(t(fm$proportions$population.states),scale.factor = 1))
+  d <- dist((fm.norm),diag = TRUE, upper = TRUE)
+  h <- hclust(d, method = "complete")
+  res <- recodeClusterY(res,from = h$order,
+                        to = c(1:nrow(fm$counts$population.states)))
+  return(res)
+}
+
 #' Simulate cells from the cell/gene clustering generative model
 #' 
 #' @param S The number of samples
@@ -246,8 +265,8 @@ simulateCells.celda_CG = function(model, S=10, C.Range=c(50,100), N.Range=c(500,
   zero.row.idx = which(rowSums(cell.counts) == 0)
   if (length(zero.row.idx > 0)) {
     cell.counts = cell.counts[-zero.row.idx, ]
-  }
-  new$y = new$y[-zero.row.idx]
+    new$y = new$y[-zero.row.idx]
+  } 
 
   ## Assign gene/cell/sample names 
   rownames(cell.counts) = paste0("Gene_", 1:nrow(cell.counts))
@@ -332,7 +351,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
   nG = nrow(counts)
   nM = ncol(counts)
 
-  ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
+  ll = .calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
 
   iter = 1
   continue = TRUE
@@ -358,7 +377,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
         temp.n.CP = n.CP
         temp.n.CP[j] = temp.n.CP[j] + sum(counts[,i])
 
-        probs[j] = cCG.calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
+        probs[j] = .calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
       }  
 
       ## Sample next state and add back counts
@@ -410,7 +429,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
         temp.nG.by.TS = nG.by.TS + (1 * ADD_PSEUDO)
         temp.nG.by.TS[j] = temp.nG.by.TS[j] + 1
 
-        probs[j] = cCG.calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
+        probs[j] = .calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
 			n.by.TS=temp.n.by.TS,
 			nG.by.TS=temp.nG.by.TS,
 			nG.in.Y=temp.nG.by.TS[j],
@@ -481,7 +500,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
    }
 
     ## Calculate complete likelihood
-    temp.ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
+    temp.ll = .calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
     if((all(temp.ll > ll)) | iter == 1) {
       z.best = z
       y.best = y
@@ -493,21 +512,21 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
     iter = iter + 1    
   }
 
-  ## Peform reordering on final Z and Y assigments:
-  reordered.labels = reorder.labels.by.size.then.counts(counts, z=z.best, 
-                                                        y=y.best, K=K, L=L)
+
   names = list(row=rownames(counts), column=colnames(counts), 
                sample=levels(sample.label))
 
 
-  result = list(z=reordered.labels$z, y=reordered.labels$y, completeLogLik=ll, 
+  result = list(z=z.best, y=y.best, completeLogLik=ll, 
                 finalLogLik=ll.best, K=K, L=L, alpha=alpha, 
                 beta=beta, delta=delta, gamma=gamma, seed=seed, 
                 sample.label=sample.label, names=names,
                 count.checksum=count.checksum)
 
   class(result) = "celda_CG" 
 
+  ## Peform reordering on final Z and Y assigments:
+  result = reorder.celdaCG(counts = counts, res = result)
   return(result)
 }
 
@@ -637,7 +656,7 @@ clusterProbability.celda_CG = function(counts, celda.mod) {
 	  temp.n.CP = n.CP
 	  temp.n.CP[j] = temp.n.CP[j] + sum(counts[,i])
 
-	  z.prob[i,j] = cCG.calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
+	  z.prob[i,j] = .calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
 	}  
 
 	m.CP.by.S[z[i],s[i]] = m.CP.by.S[z[i],s[i]] + 1
@@ -667,7 +686,7 @@ clusterProbability.celda_CG = function(counts, celda.mod) {
 	  temp.nG.by.TS = nG.by.TS + (1 * ADD_PSEUDO)
 	  temp.nG.by.TS[j] = temp.nG.by.TS[j] + 1
 
-	  y.prob[i,j] = cCG.calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
+	  y.prob[i,j] = .calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
 		  n.by.TS=temp.n.by.TS,
 		  nG.by.TS=temp.nG.by.TS,
 		  nG.in.Y=temp.nG.by.TS[j],

diff --git a/R/celda_G.R b/R/celda_G.R
@@ -123,7 +123,16 @@ cG.calcLL = function(n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delt
   return(final)
 }
 
-
+reorder.celdaG = function(counts,res){
+  #Reorder L
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
+  fm.norm <- t(normalizeCounts(fm$proportions$gene.states,scale.factor = 1))
+  d <- dist((fm.norm),diag = TRUE, upper = TRUE)
+  h <- hclust(d, method = "complete")
+  res <- recodeClusterY(res,from = h$order,
+                        to = c(1:nrow(fm$counts$cell.states)))
+  return(res)
+}
 
 
 # Calculate Log Likelihood For Single Set of Cluster Assignments (Gene Clustering)
@@ -297,16 +306,16 @@ celda_G = function(counts, L, beta=1, delta=1, gamma=1, max.iter=25,
     iter <- iter + 1    
   }
 
-  reordered.labels = reorder.label.by.size(y.best, L)
-  y.final.order = reordered.labels$new.labels
   names = list(row=rownames(counts), column=colnames(counts))  
 
-  result = list(y=y.final.order, completeLogLik=ll, 
+  result = list(y=y.best, completeLogLik=ll, 
                 finalLogLik=ll.best, L=L, beta=beta, delta=delta, gamma=gamma,
                 count.checksum=count.checksum, seed=seed, names=names)
 
   class(result) = "celda_G"
 
+  result = reorder.celdaG(counts = counts, res = result)
+
   return(result)
 }
 
@@ -361,8 +370,10 @@ simulateCells.celda_G = function(model, C=100, N.Range=c(500,5000),  G=1000,
   ## Ensure that there are no all-0 rows in the counts matrix, which violates a celda modeling
   ## constraint (columns are guarnteed at least one count):
   zero.row.idx = which(rowSums(cell.counts) == 0)
-  cell.counts = cell.counts[-zero.row.idx, ]
-  y = y[-zero.row.idx]
+  if (length(zero.row.idx > 0)) {
+    cell.counts = cell.counts[-zero.row.idx, ]
+    y = y[-zero.row.idx]
+  }
 
   rownames(cell.counts) = paste0("Gene_", 1:nrow(cell.counts))
   colnames(cell.counts) = paste0("Cell_", 1:ncol(cell.counts))

diff --git a/R/celda_functions.R b/R/celda_functions.R
diff --git a/R/celda_heatmap.R b/R/celda_heatmap.R
diff --git a/R/celda_list.R b/R/celda_list.R
@@ -2,25 +2,26 @@
 # S3 Methods for celda_list objects                                            #
 ################################################################################
 
-#' Run the celda Bayesian hierarchical model on a matrix of counts.
-#' TODO: - If no chain provided, automatically choose best chain
-#'       - Smarter subsetting of the run.params to DRY this function up
+# TODO: - If no chain provided, automatically choose best chain
+#       - Smarter subsetting of the run.params to DRY this function up
+
+#' Select the best model amongst a list of models generated in a celda run.
 #' 
 #' @param celda.list A celda_list object returned from celda()
-#' @param K The K parameter for the desired model in the results list
-#' @param L The L parameter for the desired model in the results list
-#' @param chain The desired chain for the specified model
-#' @param best Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Overrides chain parameter if provided.
+#' @param K The K parameter for the desired model in the results list. Defaults to NULL.
+#' @param L The L parameter for the desired model in the results list. Defaults to NULL.
+#' @param chain The desired chain for the specified model. Defaults to NULL. Overrides best parameter if provided.
+#' @param best Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Defaults to "loglik."
 #' @return A celda model object matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found.
 #' @export
-getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
+getModel = function(celda.list, K=NULL, L=NULL, chain=NULL, best="loglik") {
   validateGetModelParams(celda.list, K, L, chain, best)  # Sanity check params
 
   requested.chain = NA
   run.params = celda.list$run.params
 
   if (celda.list$content.type == "celda_CG") {
-    if (!is.null(best)) {
+    if (is.null(chain)) {
       matching.chain.idx = run.params[run.params$K == K & run.params$L == L, "index"]
       requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best)
     } else {
@@ -32,7 +33,7 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
 
 
   if (celda.list$content.type == "celda_C") {
-    if (!is.null(best)) {
+    if (is.null(chain)) {
       matching.chain.idx = run.params[run.params$K == K, "index"]
       requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best)
     } else {
@@ -43,7 +44,7 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
 
 
   if (celda.list$content.type == "celda_G") {
-    if (!is.null(best)) {
+    if (is.null(chain)) {
       matching.chain.idx = run.params[run.params$L == L, "index"]
       requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best)
     } else { 
@@ -62,6 +63,30 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
 }
 
 
+#' Select a set of models from a celda_list objects based off of rows in its run.params attribute.
+#' 
+#' @param celda.list A celda_list object returned from celda()
+#' @param run.param.rows Row indices in the celda.list's run params corresponding to the desired models.
+#' @return A celda_list containing celda model objects matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found.
+#' @export
+selectModels = function(celda.list, run.param.rows) {
+  desired.models = lapply(run.param.rows,
+                          function(row.idx) {
+                            if (!is.numeric(row.idx) | 
+                                row.idx < 0 | 
+                                row.idx > nrow(celda.list$run.params)) {
+                                  stop("Invalid row index provided") 
+                            }
+                            params = celda.list$run.params[row.idx, ]
+                            getModel(celda.list, params$K, params$L, params$chain)
+                          })
+  subsetted.celda.list = list(run.params=celda.list$run.params[run.param.rows, ],
+                              res.list=desired.models, content.type=celda.list$content.type,
+                              count.checksum=celda.list$count.checksum)
+  return(subsetted.celda.list)
+}
+
+
 validateGetModelParams = function(celda.list, K, L, chain, best) {
   if (class(celda.list) != "celda_list") stop("First argument to getModel() should be an object of class 'celda_list'")
 

diff --git a/R/data.R b/R/data.R
diff --git a/R/diffExp.R b/R/diffExp.R
diff --git a/R/feature_selection.R b/R/feature_selection.R
diff --git a/R/model_performance.R b/R/model_performance.R
diff --git a/R/plot_dr.R b/R/plot_dr.R
@@ -17,8 +17,8 @@ plotDrGrid <- function(dim1, dim2, matrix, size, xlab, ylab, color_low, color_mi
   colnames(m) <- c(xlab,ylab,"facet",var_label)
   ggplot2::ggplot(m, ggplot2::aes_string(x=xlab, y=ylab)) + ggplot2::geom_point(stat = "identity", size = size, ggplot2::aes_string(color = var_label)) + 
     ggplot2::facet_wrap(~facet) + ggplot2::theme_bw() + ggplot2::scale_colour_gradient2(low = color_low, high = color_high, mid = color_mid, midpoint = (max(m[,4])-min(m[,4]))/2 ,name = gsub("_"," ",var_label)) + 
-    ggplot2::theme(strip.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.spacing = unit(0,"lines"),
-                   panel.background = element_blank(), axis.line = ggplot2::element_line(colour = "black"))
+    ggplot2::theme(strip.background = ggplot2::element_blank(), panel.grid.major = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), panel.spacing = unit(0,"lines"),
+                   panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black"))
 }
 
 #' Create a scatterplot for each row of a normalized gene expression matrix where x and y axis are from a data dimensionality reduction tool.  

diff --git a/R/s3_generics.R b/R/s3_generics.R
diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R
diff --git a/R/split_clusters.R b/R/split_clusters.R
diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ An analysis example using celda with RNASeq via vignette('celda-analysis')
 
 
 ## New Features and announcements
-The v0.2 release of celda represents a useable implementation of the various celda clustering models.
+The v0.3 release of celda represents a useable implementation of the various celda clustering models.
 Please submit any usability issues or bugs to the issue tracker at https://github.com/compbiomed/celda
 
 You can discuss celda, or ask the developers usage questions, in our [Google Group.](https://groups.google.com/forum/#!forum/celda-list)
diff --git a/data/pbmc_data.rda.REMOVED.git-id b/data/pbmc_data.rda.REMOVED.git-id
diff --git a/data/sample.cells.rda b/data/sample.cells.rda
diff --git a/extdata/test.png b/extdata/test.png