Add different strategies for NB parameter estimation; add cell weight…

…s to differential expression test
satijalab · Jun 27, 2018 · 5ef3509 · 5ef3509
1 parent 3e90f5e
commit 5ef3509
Show file tree

Hide file tree

Showing 20 changed files with 210 additions and 165 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,7 @@ export(smooth_via_pca)
 export(vst)
 import(Matrix)
 import(parallel)
+importFrom(MASS,glm.nb)
 importFrom(MASS,negative.binomial)
 importFrom(MASS,theta.ml)
 importFrom(graphics,abline)

diff --git a/R/differential_expression.R b/R/differential_expression.R
@@ -33,10 +33,11 @@ compare_expression <- function(x, umi, group, val1, val2, method = 'LRT', bin_si
   use_cells <- c(sel1, sel2)
   cell_attr <- cell_attr[use_cells, ]
   group <- factor(c(rep(0, length(sel1)), rep(1, length(sel2))))
-  genes <- rownames(x$model_pars_fit)
+  weights <- c(rep(1/length(sel1), length(sel1)), rep(1/length(sel2), length(sel2)))
+  genes <- rownames(x$model_pars_fit)[rownames(x$model_pars_fit) %in% rownames(umi)]
   cells_group1 <- rowSums(umi[genes, sel1] > 0)
   cells_group2 <- rowSums(umi[genes, sel2] > 0)
-  genes <- rownames(x$model_pars_fit)[cells_group1 >= min_cells | cells_group2 >= min_cells]
+  genes <- genes[cells_group1 >= min_cells | cells_group2 >= min_cells]
 
   regressor_data <- model.matrix(as.formula(gsub('^y', '', x$model_str)), cell_attr)
   # process genes in batches
@@ -56,7 +57,7 @@ compare_expression <- function(x, umi, group, val1, val2, method = 'LRT', bin_si
       mu <- x$model_pars_fit[genes_bin, -1, drop=FALSE] %*% t(regressor_data)  # in log space
       y <- as.matrix(umi[genes_bin, use_cells])
       bin_res <- mclapply(genes_bin, function(gene) {
-        model_comparison_lrt(y[gene, ], mu[gene, ], x$model_pars_fit[gene, 'theta'], group)
+        model_comparison_lrt(y[gene, ], mu[gene, ], x$model_pars_fit[gene, 'theta'], group, weights)
       })
     }
     res[[i]] <- do.call(rbind, bin_res)
@@ -77,10 +78,10 @@ compare_expression <- function(x, umi, group, val1, val2, method = 'LRT', bin_si
 
 #' @importFrom stats glm offset anova
 #' @importFrom MASS negative.binomial
-model_comparison_lrt <- function(y, offs, theta, group) {
+model_comparison_lrt <- function(y, offs, theta, group, weights = NULL) {
   fam <- negative.binomial(theta = theta)
-  mod0 <- glm(y ~ 1 + offset(offs), family = fam)
-  mod1 <- glm(y ~ 1 + offset(offs) + group, family = fam)
+  mod0 <- glm(y ~ 1 + offset(offs), family = fam, weights = weights)
+  mod1 <- glm(y ~ 1 + offset(offs) + group, family = fam, weights = weights)
   p_val <- anova(mod0, mod1, test = 'LRT')$'Pr(>Chi)'[2]
   fold_change <- log2(exp(mod1$coefficients[2]))
   return(c(p_val, fold_change))

diff --git a/R/plotting.R b/R/plotting.R
@@ -20,7 +20,6 @@ plot_model_pars <- function(vst_out) {
     geom_point(data=df, size=0.5, alpha=0.5, shape=16) +
     geom_point(data=df_fit, size=0.66, alpha=0.5, shape=16) +
     facet_wrap(~ parameter, scales = 'free_y') +
-    theme_gray(base_size = 8) +
     theme(legend.position='bottom')
   return(g)
 }
@@ -75,14 +74,16 @@ get_nb_fit <- function(x, umi, gene, cell_attr, as_poisson = FALSE) {
 #' @param as_poisson Fix model parameter theta to Inf, effectively showing a Poisson model
 #' @param arrange_vertical Stack individual ggplot objects or place side by side
 #' @param show_density Draw 2D density lines over points
+#' @param gg_cmds Additional ggplot layer commands
 #'
 #' @return A ggplot object
 #'
 #' @export
 #'
 plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr = x$cell_attr,
                        do_log = TRUE, show_fit = TRUE, show_nr = FALSE, plot_residual = FALSE,
-                       batches = NULL, as_poisson = FALSE, arrange_vertical = TRUE, show_density = TRUE) {
+                       batches = NULL, as_poisson = FALSE, arrange_vertical = TRUE, show_density = TRUE,
+                       gg_cmds = NULL) {
   require('ggplot2')
   require('gridExtra')
   if (is.null(batches)) {
@@ -138,7 +139,7 @@ plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr
   if (do_log) {
     g <- g + ylab('Gene log10(UMI + 1)')
   }
-  g <- g + theme_gray(base_size = 8)
+  g <- g + gg_cmds
   if (length(goi) == 1) {
     g <- g + theme(strip.text = element_blank())
   }
@@ -149,7 +150,7 @@ plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr
     g2 <- ggplot(df, aes(x, res)) + geom_point(alpha = 0.5, shape=16) +
       coord_cartesian(ylim = res_range) +
       facet_grid(~gene) + xlab(x) + ylab('Pearson residual') +
-      xlab(paste('Cell', x_var)) + theme_gray(base_size = 8) +
+      xlab(paste('Cell', x_var)) + gg_cmds +
       theme(strip.text = element_blank()) # strip.background = element_blank(),
     if (show_density) {
       g2 <- g2 + geom_density_2d(color = 'lightblue', size=0.5)
@@ -158,7 +159,7 @@ plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr
       g3 <- ggplot(df, aes(x, res_nr)) + geom_point(alpha = 0.5, shape=16) +
         coord_cartesian(ylim = res_range) +
         facet_grid(~gene) + xlab(x) + ylab('Pearson residual non-reg.') +
-        xlab(paste('Cell', x_var)) + theme_gray(base_size = 8) +
+        xlab(paste('Cell', x_var)) + gg_cmds +
         theme(strip.text = element_blank()) # strip.background = element_blank(),
       if (show_density) {
         g3 <- g3 + geom_density_2d(color = 'lightblue', size=0.5)

diff --git a/R/utils.R b/R/utils.R
@@ -17,7 +17,11 @@ is_outlier <- function(y, x, n, th = 40) {
   bins <- cut(rank(x, ties.method = 'first'), n.bins, ordered_result = TRUE)
   tmp <- aggregate(x = y, by = list(bin=bins), FUN = robust_scale)
   score <- rep(0, length(x))
-  score[o] <- as.numeric(t(tmp$x))
+  if (class(tmp$x) == 'list') {
+    score[o] <- unlist(tmp$x)
+  } else {
+    score[o] <- as.numeric(t(tmp$x))
+  }
   return(score > th)
 }
 

diff --git a/R/vst.R b/R/vst.R
@@ -11,6 +11,7 @@
 #' @param latent_var The dependent variables to regress out as a character vector; must match column names in cell_attr; default is c("log_umi_per_gene")
 #' @param batch_var The dependent variables indicating which batch a cell belongs to; no batch interaction terms used if omiited
 #' @param n_genes Number of genes to use when estimating parameters (default uses 2000 genes, set to NULL to use all genes)
+#' @param method Method to use for initial parameter estimation; one of 'poisson', 'nb_fast', 'nb'
 #' @param res_clip_range Numeric of length two specifying the min and max values the results will be clipped to
 #' @param bin_size Number of genes to put in each bin (to show progress)
 #' @param min_cells Only use genes that have been detected in at least this many cells
@@ -29,7 +30,7 @@
 #'
 #' @import Matrix
 #' @import parallel
-#' @importFrom MASS theta.ml
+#' @importFrom MASS theta.ml glm.nb negative.binomial
 #' @importFrom stats glm ksmooth model.matrix as.formula approx density poisson var
 #' @importFrom utils txtProgressBar setTxtProgressBar
 #'
@@ -43,6 +44,7 @@ vst <- function(umi,
                 latent_var = c('log_umi_per_gene'),
                 batch_var = NULL,
                 n_genes = 2000,
+                method = 'poisson',
                 res_clip_range = c(-50, 50),
                 bin_size = 256,
                 min_cells = 5,
@@ -106,9 +108,31 @@ vst <- function(umi,
         X = genes_bin_regress,
         FUN = function(j) {
           y <- umi_bin[j, ]
-          fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
-          theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
-          return(c(theta, fit$coefficients))
+          if (method == 'poisson') {
+            fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
+            theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
+            return(c(theta, fit$coefficients))
+          }
+          if (method == 'nb_fast') {
+            fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
+            theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
+            fit2 <- 0
+            try(fit2 <- glm(as.formula(model_str), data = cell_attr, family = negative.binomial(theta=theta)), silent=TRUE)
+            if (class(fit2)[1] == 'numeric') {
+              return(c(theta, fit$coefficients))
+            } else {
+              return(c(theta, fit2$coefficients))
+            }
+          }
+          if (method == 'nb') {
+            fit <- 0
+            try(fit <- glm.nb(as.formula(model_str), data = cell_attr), silent=TRUE)
+            if (class(fit)[1] == 'numeric') {
+              fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
+              fit$theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
+            }
+            return(c(fit$theta, fit$coefficients))
+          }
         }
       )
     )

diff --git a/README.md b/README.md
@@ -1,5 +1,9 @@
 # sctransform
-R package for single cell expression data transformation and normalization
+R package for modeling single cell UMI expression data using regularized negative binomial regression
+
+This packaged was developed by Christoph Hafemeister in [Rahul Satija's lab](https://satijalab.org/) at the New York Genome Center. A previous version of this work was used in the paper [Developmental diversification of cortical inhibitory interneurons, Nature 555, 2018](https://github.com/ChristophH/in-lineage) and is, in part, also implemented in [Seurat](https://satijalab.org/seurat/), an R package designed for QC, analysis, and exploration of single cell RNA-seq data.
+
+This package is in beta status, please sanity check any results, and notify me of any issues you find.
 
 ## Quick start
 `devtools::install_github(repo = 'ChristophH/sctransform')`  

diff --git a/inst/doc/batch_correction.R b/inst/doc/batch_correction.R
@@ -1,16 +1,18 @@
 ## ----setup, include = FALSE----------------------------------------------
+library('Matrix')
+library('ggplot2')
+library('reshape2')
+library('sctransform')
+library('knitr')
+knit_hooks$set(optipng = hook_optipng)
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
   digits = 2,
+  optipng = '-o7',
   fig.width=5, fig.height=3, dpi=100, out.width = '70%'
 )
-
-## ----load_packages, include=FALSE----------------------------------------
-library('Matrix')
-library('ggplot2')
-library('reshape2')
-library('sctransform')
+old_theme <- theme_set(theme_classic(base_size=8))
 
 ## ------------------------------------------------------------------------
 # some of the vst steps can use multiple cores, set number here

diff --git a/inst/doc/batch_correction.Rmd b/inst/doc/batch_correction.Rmd
@@ -10,19 +10,20 @@ vignette: >
 ---
 
 ```{r setup, include = FALSE}
+library('Matrix')
+library('ggplot2')
+library('reshape2')
+library('sctransform')
+library('knitr')
+knit_hooks$set(optipng = hook_optipng)
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
   digits = 2,
+  optipng = '-o7',
   fig.width=5, fig.height=3, dpi=100, out.width = '70%'
 )
-```
-
-```{r load_packages, include=FALSE}
-library('Matrix')
-library('ggplot2')
-library('reshape2')
-library('sctransform')
+old_theme <- theme_set(theme_classic(base_size=8))
 ```
 
 In this vignette we show how the regression model in the variance stabilizing transformation can also be used to mitigate batch effects. We use data from [Shekhar et al., Cell 2016](https://www.ncbi.nlm.nih.gov/pubmed/27565351) to demonstrate the functionality. The original analysis and data are available [here](https://github.com/broadinstitute/BipolarCell2016). 

diff --git a/inst/doc/denoising.R b/inst/doc/denoising.R
@@ -1,16 +1,18 @@
 ## ----setup, include = FALSE----------------------------------------------
+library('Matrix')
+library('ggplot2')
+library('reshape2')
+library('sctransform')
+library('knitr')
+knit_hooks$set(optipng = hook_optipng)
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
   digits = 2,
+  optipng = '-o7',
   fig.width=6, fig.height=4, dpi=100, out.width = '70%'
 )
-
-## ----load_packages, include=FALSE----------------------------------------
-library('Matrix')
-library('ggplot2')
-library('reshape2')
-library('sctransform')
+old_theme <- theme_set(theme_classic(base_size=8))
 
 ## ------------------------------------------------------------------------
 options(mc.cores = 4)

diff --git a/inst/doc/denoising.Rmd b/inst/doc/denoising.Rmd
@@ -10,19 +10,20 @@ vignette: >
 ---
 
 ```{r setup, include = FALSE}
+library('Matrix')
+library('ggplot2')
+library('reshape2')
+library('sctransform')
+library('knitr')
+knit_hooks$set(optipng = hook_optipng)
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
   digits = 2,
+  optipng = '-o7',
   fig.width=6, fig.height=4, dpi=100, out.width = '70%'
 )
-```
-
-```{r load_packages, include=FALSE}
-library('Matrix')
-library('ggplot2')
-library('reshape2')
-library('sctransform')
+old_theme <- theme_set(theme_classic(base_size=8))
 ```
 
 In this vignette we show how the regression model in the variance stabilizing transformation can be used to output de-noised data. Implicitly there are two levels of smoothing/de-noising when we apply the standard workflow using `vst`. First we specify latent variables that are used in the regression model - their contribution to the overall variance in the data will be removed. Second, we usually perform dimensionality reduction, which acts like a smoothing operation. Here we show how to reverse these two operations to obtain de-noised UMI counts.

diff --git a/inst/doc/differential_expression.R b/inst/doc/differential_expression.R
@@ -1,19 +1,24 @@
 ## ----setup, include = FALSE----------------------------------------------
+library('Matrix')
+library('ggplot2')
+library('reshape2')
+library('sctransform')
+library('knitr')
+knit_hooks$set(optipng = hook_optipng)
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
   digits = 2,
+  optipng = '-o7',
   fig.width=4, fig.height=2.5, dpi=100, out.width = '70%'
 )
-
-## ----load_packages, include=FALSE----------------------------------------
-library('Matrix')
-library('ggplot2')
-library('reshape2')
-library('sctransform')
+old_theme <- theme_set(theme_classic(base_size=10))
 
 ## ----load_data-----------------------------------------------------------
+pbmc_clusters <- readRDS(file = "~/Projects/data/pbmc3k_celltypes.rds")
 pbmc_data <- readRDS(file = "~/Projects/data/pbmc3k_umi_counts.Rds")
+pbmc_data <- pbmc_data[, names(pbmc_clusters)]
+
 class(pbmc_data)
 dim(pbmc_data)
 
@@ -23,14 +28,10 @@ set.seed(43)
 vst_out <- sctransform::vst(pbmc_data, latent_var = c('log_umi_per_gene'), return_gene_attr = TRUE, return_cell_attr = TRUE, show_progress = FALSE)
 
 ## ------------------------------------------------------------------------
-var_genes <- rownames(vst_out$gene_attr)[scale(vst_out$gene_attr$residual_variance)[, 1] > 1]
-pca <- irlba::prcomp_irlba(t(vst_out$y[var_genes, ]), n = 20)
-ce <- pca$x[, 1:10]
-hcl <- hclust(dist(ce), method='ward.D2')
-clustering <- cutree(hcl, 10)
-
-## ------------------------------------------------------------------------
-res1 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = clustering, val1 = 1, val2 = 2, show_progress = FALSE)
+res1 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = pbmc_clusters, 
+                                        val1 = 'CD14+ Monocytes', 
+                                        val2 = 'FCGR3A+ Monocytes', 
+                                        show_progress = FALSE)
 
 ## ------------------------------------------------------------------------
 head(subset(res1, log_fc < 0), 10)
@@ -40,15 +41,16 @@ head(subset(res1, log_fc > 0), 10)
 ggplot(res1, aes(log_fc, -log10(p_value))) + geom_point(alpha=0.4, shape=16)
 
 ## ------------------------------------------------------------------------
-res2 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = clustering, val1 = setdiff(clustering, 3), val2 = 3, show_progress = FALSE)
+res2 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = pbmc_clusters, val1 = setdiff(pbmc_clusters, 'CD8 T cells'), val2 = 'CD8 T cells', show_progress = FALSE)
+head(subset(res2, log_fc > 0), 10)
 
 ## ---- fig.width=4, fig.height=4, out.width='49%', fig.show='hold'--------
 goi <- rownames(subset(res2, log_fc > 0))[1:3]
 df <- melt(t(as.matrix(pbmc_data[goi, ])), varnames = c('cell', 'gene'), value.name = 'UMI')
-df$cluster <- clustering
-ggplot(df, aes(x = gene, y = log10(UMI + 1), color = cluster == 3)) + geom_violin(scale = 'width')
+df$cluster <- pbmc_clusters
+ggplot(df, aes(x = gene, y = log10(UMI + 1), color = cluster == 'CD8 T cells')) + geom_violin(scale = 'width')
 df <- melt(t(as.matrix(vst_out$y[goi, ])), varnames = c('cell', 'gene'), value.name = 'Pearson_residual')
-df$cluster <- clustering
-ggplot(df, aes(x = gene, y = Pearson_residual, color = cluster == 3)) + geom_violin(scale = 'width')
+df$cluster <- pbmc_clusters
+ggplot(df, aes(x = gene, y = Pearson_residual, color = cluster == 'CD8 T cells')) + geom_violin(scale = 'width')