Skip to content

Commit

Permalink
Add different strategies for NB parameter estimation; add cell weight…
Browse files Browse the repository at this point in the history
…s to differential expression test
  • Loading branch information
ChristophH committed Jun 27, 2018
1 parent 3e90f5e commit 5ef3509
Show file tree
Hide file tree
Showing 20 changed files with 210 additions and 165 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export(smooth_via_pca)
export(vst)
import(Matrix)
import(parallel)
importFrom(MASS,glm.nb)
importFrom(MASS,negative.binomial)
importFrom(MASS,theta.ml)
importFrom(graphics,abline)
Expand Down
13 changes: 7 additions & 6 deletions R/differential_expression.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@ compare_expression <- function(x, umi, group, val1, val2, method = 'LRT', bin_si
use_cells <- c(sel1, sel2)
cell_attr <- cell_attr[use_cells, ]
group <- factor(c(rep(0, length(sel1)), rep(1, length(sel2))))
genes <- rownames(x$model_pars_fit)
weights <- c(rep(1/length(sel1), length(sel1)), rep(1/length(sel2), length(sel2)))
genes <- rownames(x$model_pars_fit)[rownames(x$model_pars_fit) %in% rownames(umi)]
cells_group1 <- rowSums(umi[genes, sel1] > 0)
cells_group2 <- rowSums(umi[genes, sel2] > 0)
genes <- rownames(x$model_pars_fit)[cells_group1 >= min_cells | cells_group2 >= min_cells]
genes <- genes[cells_group1 >= min_cells | cells_group2 >= min_cells]

regressor_data <- model.matrix(as.formula(gsub('^y', '', x$model_str)), cell_attr)
# process genes in batches
Expand All @@ -56,7 +57,7 @@ compare_expression <- function(x, umi, group, val1, val2, method = 'LRT', bin_si
mu <- x$model_pars_fit[genes_bin, -1, drop=FALSE] %*% t(regressor_data) # in log space
y <- as.matrix(umi[genes_bin, use_cells])
bin_res <- mclapply(genes_bin, function(gene) {
model_comparison_lrt(y[gene, ], mu[gene, ], x$model_pars_fit[gene, 'theta'], group)
model_comparison_lrt(y[gene, ], mu[gene, ], x$model_pars_fit[gene, 'theta'], group, weights)
})
}
res[[i]] <- do.call(rbind, bin_res)
Expand All @@ -77,10 +78,10 @@ compare_expression <- function(x, umi, group, val1, val2, method = 'LRT', bin_si

#' @importFrom stats glm offset anova
#' @importFrom MASS negative.binomial
model_comparison_lrt <- function(y, offs, theta, group) {
model_comparison_lrt <- function(y, offs, theta, group, weights = NULL) {
fam <- negative.binomial(theta = theta)
mod0 <- glm(y ~ 1 + offset(offs), family = fam)
mod1 <- glm(y ~ 1 + offset(offs) + group, family = fam)
mod0 <- glm(y ~ 1 + offset(offs), family = fam, weights = weights)
mod1 <- glm(y ~ 1 + offset(offs) + group, family = fam, weights = weights)
p_val <- anova(mod0, mod1, test = 'LRT')$'Pr(>Chi)'[2]
fold_change <- log2(exp(mod1$coefficients[2]))
return(c(p_val, fold_change))
Expand Down
11 changes: 6 additions & 5 deletions R/plotting.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ plot_model_pars <- function(vst_out) {
geom_point(data=df, size=0.5, alpha=0.5, shape=16) +
geom_point(data=df_fit, size=0.66, alpha=0.5, shape=16) +
facet_wrap(~ parameter, scales = 'free_y') +
theme_gray(base_size = 8) +
theme(legend.position='bottom')
return(g)
}
Expand Down Expand Up @@ -75,14 +74,16 @@ get_nb_fit <- function(x, umi, gene, cell_attr, as_poisson = FALSE) {
#' @param as_poisson Fix model parameter theta to Inf, effectively showing a Poisson model
#' @param arrange_vertical Stack individual ggplot objects or place side by side
#' @param show_density Draw 2D density lines over points
#' @param gg_cmds Additional ggplot layer commands
#'
#' @return A ggplot object
#'
#' @export
#'
plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr = x$cell_attr,
do_log = TRUE, show_fit = TRUE, show_nr = FALSE, plot_residual = FALSE,
batches = NULL, as_poisson = FALSE, arrange_vertical = TRUE, show_density = TRUE) {
batches = NULL, as_poisson = FALSE, arrange_vertical = TRUE, show_density = TRUE,
gg_cmds = NULL) {
require('ggplot2')
require('gridExtra')
if (is.null(batches)) {
Expand Down Expand Up @@ -138,7 +139,7 @@ plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr
if (do_log) {
g <- g + ylab('Gene log10(UMI + 1)')
}
g <- g + theme_gray(base_size = 8)
g <- g + gg_cmds
if (length(goi) == 1) {
g <- g + theme(strip.text = element_blank())
}
Expand All @@ -149,7 +150,7 @@ plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr
g2 <- ggplot(df, aes(x, res)) + geom_point(alpha = 0.5, shape=16) +
coord_cartesian(ylim = res_range) +
facet_grid(~gene) + xlab(x) + ylab('Pearson residual') +
xlab(paste('Cell', x_var)) + theme_gray(base_size = 8) +
xlab(paste('Cell', x_var)) + gg_cmds +
theme(strip.text = element_blank()) # strip.background = element_blank(),
if (show_density) {
g2 <- g2 + geom_density_2d(color = 'lightblue', size=0.5)
Expand All @@ -158,7 +159,7 @@ plot_model <- function(x, umi, goi, x_var = x$arguments$latent_var[1], cell_attr
g3 <- ggplot(df, aes(x, res_nr)) + geom_point(alpha = 0.5, shape=16) +
coord_cartesian(ylim = res_range) +
facet_grid(~gene) + xlab(x) + ylab('Pearson residual non-reg.') +
xlab(paste('Cell', x_var)) + theme_gray(base_size = 8) +
xlab(paste('Cell', x_var)) + gg_cmds +
theme(strip.text = element_blank()) # strip.background = element_blank(),
if (show_density) {
g3 <- g3 + geom_density_2d(color = 'lightblue', size=0.5)
Expand Down
6 changes: 5 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ is_outlier <- function(y, x, n, th = 40) {
bins <- cut(rank(x, ties.method = 'first'), n.bins, ordered_result = TRUE)
tmp <- aggregate(x = y, by = list(bin=bins), FUN = robust_scale)
score <- rep(0, length(x))
score[o] <- as.numeric(t(tmp$x))
if (class(tmp$x) == 'list') {
score[o] <- unlist(tmp$x)
} else {
score[o] <- as.numeric(t(tmp$x))
}
return(score > th)
}

Expand Down
32 changes: 28 additions & 4 deletions R/vst.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#' @param latent_var The dependent variables to regress out as a character vector; must match column names in cell_attr; default is c("log_umi_per_gene")
#' @param batch_var The dependent variables indicating which batch a cell belongs to; no batch interaction terms used if omiited
#' @param n_genes Number of genes to use when estimating parameters (default uses 2000 genes, set to NULL to use all genes)
#' @param method Method to use for initial parameter estimation; one of 'poisson', 'nb_fast', 'nb'
#' @param res_clip_range Numeric of length two specifying the min and max values the results will be clipped to
#' @param bin_size Number of genes to put in each bin (to show progress)
#' @param min_cells Only use genes that have been detected in at least this many cells
Expand All @@ -29,7 +30,7 @@
#'
#' @import Matrix
#' @import parallel
#' @importFrom MASS theta.ml
#' @importFrom MASS theta.ml glm.nb negative.binomial
#' @importFrom stats glm ksmooth model.matrix as.formula approx density poisson var
#' @importFrom utils txtProgressBar setTxtProgressBar
#'
Expand All @@ -43,6 +44,7 @@ vst <- function(umi,
latent_var = c('log_umi_per_gene'),
batch_var = NULL,
n_genes = 2000,
method = 'poisson',
res_clip_range = c(-50, 50),
bin_size = 256,
min_cells = 5,
Expand Down Expand Up @@ -106,9 +108,31 @@ vst <- function(umi,
X = genes_bin_regress,
FUN = function(j) {
y <- umi_bin[j, ]
fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
return(c(theta, fit$coefficients))
if (method == 'poisson') {
fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
return(c(theta, fit$coefficients))
}
if (method == 'nb_fast') {
fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
fit2 <- 0
try(fit2 <- glm(as.formula(model_str), data = cell_attr, family = negative.binomial(theta=theta)), silent=TRUE)
if (class(fit2)[1] == 'numeric') {
return(c(theta, fit$coefficients))
} else {
return(c(theta, fit2$coefficients))
}
}
if (method == 'nb') {
fit <- 0
try(fit <- glm.nb(as.formula(model_str), data = cell_attr), silent=TRUE)
if (class(fit)[1] == 'numeric') {
fit <- glm(as.formula(model_str), data = cell_attr, family = poisson)
fit$theta <- as.numeric(x = theta.ml(y = y, mu = fit$fitted))
}
return(c(fit$theta, fit$coefficients))
}
}
)
)
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# sctransform
R package for single cell expression data transformation and normalization
R package for modeling single cell UMI expression data using regularized negative binomial regression

This packaged was developed by Christoph Hafemeister in [Rahul Satija's lab](https://satijalab.org/) at the New York Genome Center. A previous version of this work was used in the paper [Developmental diversification of cortical inhibitory interneurons, Nature 555, 2018](https://github.com/ChristophH/in-lineage) and is, in part, also implemented in [Seurat](https://satijalab.org/seurat/), an R package designed for QC, analysis, and exploration of single cell RNA-seq data.

This package is in beta status, please sanity check any results, and notify me of any issues you find.

## Quick start
`devtools::install_github(repo = 'ChristophH/sctransform')`
Expand Down
14 changes: 8 additions & 6 deletions inst/doc/batch_correction.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
## ----setup, include = FALSE----------------------------------------------
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
library('knitr')
knit_hooks$set(optipng = hook_optipng)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
digits = 2,
optipng = '-o7',
fig.width=5, fig.height=3, dpi=100, out.width = '70%'
)

## ----load_packages, include=FALSE----------------------------------------
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
old_theme <- theme_set(theme_classic(base_size=8))

## ------------------------------------------------------------------------
# some of the vst steps can use multiple cores, set number here
Expand Down
15 changes: 8 additions & 7 deletions inst/doc/batch_correction.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,20 @@ vignette: >
---

```{r setup, include = FALSE}
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
library('knitr')
knit_hooks$set(optipng = hook_optipng)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
digits = 2,
optipng = '-o7',
fig.width=5, fig.height=3, dpi=100, out.width = '70%'
)
```

```{r load_packages, include=FALSE}
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
old_theme <- theme_set(theme_classic(base_size=8))
```

In this vignette we show how the regression model in the variance stabilizing transformation can also be used to mitigate batch effects. We use data from [Shekhar et al., Cell 2016](https://www.ncbi.nlm.nih.gov/pubmed/27565351) to demonstrate the functionality. The original analysis and data are available [here](https://github.com/broadinstitute/BipolarCell2016).
Expand Down
14 changes: 8 additions & 6 deletions inst/doc/denoising.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
## ----setup, include = FALSE----------------------------------------------
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
library('knitr')
knit_hooks$set(optipng = hook_optipng)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
digits = 2,
optipng = '-o7',
fig.width=6, fig.height=4, dpi=100, out.width = '70%'
)

## ----load_packages, include=FALSE----------------------------------------
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
old_theme <- theme_set(theme_classic(base_size=8))

## ------------------------------------------------------------------------
options(mc.cores = 4)
Expand Down
15 changes: 8 additions & 7 deletions inst/doc/denoising.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,20 @@ vignette: >
---

```{r setup, include = FALSE}
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
library('knitr')
knit_hooks$set(optipng = hook_optipng)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
digits = 2,
optipng = '-o7',
fig.width=6, fig.height=4, dpi=100, out.width = '70%'
)
```

```{r load_packages, include=FALSE}
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
old_theme <- theme_set(theme_classic(base_size=8))
```

In this vignette we show how the regression model in the variance stabilizing transformation can be used to output de-noised data. Implicitly there are two levels of smoothing/de-noising when we apply the standard workflow using `vst`. First we specify latent variables that are used in the regression model - their contribution to the overall variance in the data will be removed. Second, we usually perform dimensionality reduction, which acts like a smoothing operation. Here we show how to reverse these two operations to obtain de-noised UMI counts.
Expand Down
40 changes: 21 additions & 19 deletions inst/doc/differential_expression.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
## ----setup, include = FALSE----------------------------------------------
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
library('knitr')
knit_hooks$set(optipng = hook_optipng)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
digits = 2,
optipng = '-o7',
fig.width=4, fig.height=2.5, dpi=100, out.width = '70%'
)

## ----load_packages, include=FALSE----------------------------------------
library('Matrix')
library('ggplot2')
library('reshape2')
library('sctransform')
old_theme <- theme_set(theme_classic(base_size=10))

## ----load_data-----------------------------------------------------------
pbmc_clusters <- readRDS(file = "~/Projects/data/pbmc3k_celltypes.rds")
pbmc_data <- readRDS(file = "~/Projects/data/pbmc3k_umi_counts.Rds")
pbmc_data <- pbmc_data[, names(pbmc_clusters)]

class(pbmc_data)
dim(pbmc_data)

Expand All @@ -23,14 +28,10 @@ set.seed(43)
vst_out <- sctransform::vst(pbmc_data, latent_var = c('log_umi_per_gene'), return_gene_attr = TRUE, return_cell_attr = TRUE, show_progress = FALSE)

## ------------------------------------------------------------------------
var_genes <- rownames(vst_out$gene_attr)[scale(vst_out$gene_attr$residual_variance)[, 1] > 1]
pca <- irlba::prcomp_irlba(t(vst_out$y[var_genes, ]), n = 20)
ce <- pca$x[, 1:10]
hcl <- hclust(dist(ce), method='ward.D2')
clustering <- cutree(hcl, 10)

## ------------------------------------------------------------------------
res1 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = clustering, val1 = 1, val2 = 2, show_progress = FALSE)
res1 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = pbmc_clusters,
val1 = 'CD14+ Monocytes',
val2 = 'FCGR3A+ Monocytes',
show_progress = FALSE)

## ------------------------------------------------------------------------
head(subset(res1, log_fc < 0), 10)
Expand All @@ -40,15 +41,16 @@ head(subset(res1, log_fc > 0), 10)
ggplot(res1, aes(log_fc, -log10(p_value))) + geom_point(alpha=0.4, shape=16)

## ------------------------------------------------------------------------
res2 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = clustering, val1 = setdiff(clustering, 3), val2 = 3, show_progress = FALSE)
res2 <- sctransform::compare_expression(x = vst_out, umi = pbmc_data, group = pbmc_clusters, val1 = setdiff(pbmc_clusters, 'CD8 T cells'), val2 = 'CD8 T cells', show_progress = FALSE)
head(subset(res2, log_fc > 0), 10)

## ---- fig.width=4, fig.height=4, out.width='49%', fig.show='hold'--------
goi <- rownames(subset(res2, log_fc > 0))[1:3]
df <- melt(t(as.matrix(pbmc_data[goi, ])), varnames = c('cell', 'gene'), value.name = 'UMI')
df$cluster <- clustering
ggplot(df, aes(x = gene, y = log10(UMI + 1), color = cluster == 3)) + geom_violin(scale = 'width')
df$cluster <- pbmc_clusters
ggplot(df, aes(x = gene, y = log10(UMI + 1), color = cluster == 'CD8 T cells')) + geom_violin(scale = 'width')
df <- melt(t(as.matrix(vst_out$y[goi, ])), varnames = c('cell', 'gene'), value.name = 'Pearson_residual')
df$cluster <- clustering
ggplot(df, aes(x = gene, y = Pearson_residual, color = cluster == 3)) + geom_violin(scale = 'width')
df$cluster <- pbmc_clusters
ggplot(df, aes(x = gene, y = Pearson_residual, color = cluster == 'CD8 T cells')) + geom_violin(scale = 'width')


Loading

0 comments on commit 5ef3509

Please sign in to comment.