This repository has been archived by the owner on Feb 6, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2018_zinbwave_droplasso.tex
425 lines (341 loc) · 12.9 KB
/
2018_zinbwave_droplasso.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
\documentclass{beamer}
%
% Choose how your presentation looks.
%
% For more themes, color themes and font themes, see:
% http://deic.uab.es/~iblanes/beamer_gallery/index_by_theme.html
%
\mode<presentation>
{
\usetheme{default} % or try Darmstadt, Madrid, Warsaw, ...
\usecolortheme{default} % or try albatross, beaver, crane, ...
\usefonttheme{default} % or try serif, structurebold, ...
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{caption}[numbered]
}
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage{amstext}
%\usepackage{coloremoji}
\usepackage{layout}
\usepackage{multirow}
\usepackage{graphicx}
\graphicspath{ {figs/} }
\setbeameroption{hide notes}
\setbeamertemplate{note page}[plain]
\usepackage{listings}
\usepackage{datetime}
\usepackage{url}
% specifications for presenter mode
%\beamerdefaultoverlayspecification{<+->}
%\setbeamercovered{transparent}
% math shorthand
\usepackage{bm}
\usepackage{amsmath}
\usepackage{mathtools}
% Blackboard-bold and calligraphic symbol shorthands.
\newcommand{\R}{\mathbb{R}}
\newcommand{\D}{\mathcal{D}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\F}{\mathcal{F}}
\newcommand{\X}{\mathcal{X}}
% \lik is the loss symbol used in the regularized objectives on the slides.
\newcommand{\lik}{\mathcal{L}}
% \infdivx{a}{b} typesets (a || b); \delimsize scales the inner bars together
% with the outer parentheses (mathtools paired-delimiter mechanism).
\DeclarePairedDelimiterX{\infdivx}[2]{(}{)}{%
#1\;\delimsize\|\;#2%
}
% \infdiv{a}{b} prefixes the pair with D, i.e. D(a || b).
\newcommand{\infdiv}{D\infdivx}
% \norm{x} wraps its argument in double vertical bars.
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert}
% Starred form: subscripts are placed underneath the operator in display style.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
% Bibliography
\usepackage{natbib}
\bibpunct{(}{)}{,}{a}{}{;}
\usepackage{bibentry}
%\nobibliography*
\title[zinbwave-droplasso]{Differential Expression Analysis Techniques for
Single-Cell RNA-seq Experiments}
\subtitle{\vspace*{0.5em} \scriptsize for the Computational Biology Doctoral
Seminar (CMPBIO 293),\\ organized by N.~Yosef \& T.~Ashuach, Spring 2018, UC
Berkeley}
\author{Kevin Benac and Nima Hejazi}
\institute{Group in Biostatistics,\\ University of California, Berkeley}
\date{11 April 2018}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\setcounter{tocdepth}[2]
\AtBeginSubsection[]{
\begin{frame}{Outline}
\tableofcontents[currentsection,currentsubsection]
\end{frame}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
\subsection{Data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{The Data: Single-Cell RNA-seq}
\begin{itemize}
\itemsep10pt
\item scRNA-seq is a fast-growing approach to measure the genome-wide transcriptome
of many individual cells in parallel (Kolodziejczyk et al., 2015).
\item Major advance compared to standard ``bulk'' RNA sequencing to investigate
complex heterogeneous tissues.
\item Access to cell-to-cell variability: better accuracy.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{The Data: Single-Cell RNA-seq}
%No free lunch thm
\begin{itemize}
\itemsep10pt
\item However, analysis of single-cell RNA-seq data is challenging.
\item In one cell, only a tiny amount of RNA is present and a large fraction of
polyadenylated RNA can be stochastically lost during sample preparation
steps (cell lysis, reverse transcription or amplification). \\
$\Longrightarrow$ Many genes fail to be detected although they are
expressed!
\item In practice, not uncommon to end up with a matrix of read counts where
about 80\% of the coefficients are zeros.
\item These zeros are called \textit{dropouts}.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{The Data: Single-Cell RNA-seq}
\begin{table}[ht]
\centering
\begin{tabular}{rrrrrrrr}
\hline
& Cell 1 & Cell 2 & Cell 3 & Cell 4 & Cell 5 & Cell 6 & Cell 7 \\
\hline
Xkr4 & 0 & 0 & 0 & 14 & 0 & 0 & 0 \\
Syt11 & 1 & 9 & 2 & 2 & 0 & 0 & 0 \\
Cpe & 0 & 0 & 16 & 0 & 0 & 0 & 0 \\
Rp1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
Gm73 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
Gm79 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
Mpl15 & 8 & 8 & 6 & 1 & 0 & 0 & 0 \\
Gm61 & 0 & 0 & 0 & 0 & 0 & 3 & 0 \\
Lypla1 & 1 & 23 & 266 & 1 & 0 & 1 & 0 \\
Tcea1 & 63 & 101 & 18 & 29 & 2 & 34 & 0 \\
\hline
\end{tabular}
\end{table}
\end{frame}
\begin{frame}{The Data: Single-Cell RNA-seq}
\begin{itemize}
\itemsep10pt
\item Raises modelling and computational issues.
\item Need to detect a signal when most of the values are zeros only because
they are missing.
\item Traditional methods used for bulk RNA-seq data might not be sensible
anymore.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Objective}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{The Objective: Differential Expression}
\begin{itemize}
\itemsep12pt
\item Why ``differential''? The goal is to find a subset of relevant
biomarkers with respect to a particular condition of interest (e.g.,
disease, tissue of origin).
\item Many experimental settings seek to isolate a subset of biomarkers from
the full (larger) assayed set in order to identify biological patterns and
better inform future biological experiments.
\item Since experimental costs are high and modern biotechnologies allow
numerous biological targets (e.g., genes) to be assayed, the result is a
very high-dimensional statistical problem.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{The Objective: Differential Expression}
\begin{itemize}
\itemsep12pt
\item Regularized Linear Models:
\[
\min_{w \in \R^d} \left\{ \frac{1}{n} \sum_{i = 1}^n \lik(w, x_i, y_i) +
\lambda \Omega(w) \right\}
\]
\item Lasso for continuous outcomes (squared-error loss):
\[
\min_{w \in \R^d} \left\{ \frac{1}{n} \sum_{i = 1}^n \left(y_i - \sum_{j =
1}^d w_j x_{i,j} \right)^2 + \lambda \sum_{j = 1}^d \lvert w_j \rvert
\right\}
\]
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Methodology}
\subsection{ZINB-WaVE}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{ZINB-WaVE}
\begin{itemize}
\itemsep10pt
\item Method that leads to low-dimensional representations of the data the
same way PCA or tSNE does. \pause
\item However, it accounts for zero inflation (dropouts), over-dispersion, and the
count nature of the data.
\item No need for normalization.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{ZINB-WaVE}
Mathematical set-up:
\begin{itemize}
\itemsep10pt
\item $n$ samples (single-cells),
\item $J$ genes,
\item $Y_{ij}$: read counts for gene $j$ in cell $i$, $1 \leq i \leq n$,
$1 \leq j \leq J$,
\item $\pi_{ij}$: probability of dropout,
\item $\mu_{ij}$: mean expression level.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{ZINB-WaVE}
\begin{figure}
\centering
\includegraphics[height=0.5\textheight]{zinbwave}
\caption{The ZINB-WaVE model}
\end{figure}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{ZINB-WaVE}
\begin{itemize}
\itemsep10pt
\item ZINB-WaVE mainly used for normalization and dimensionality reduction but
can also be used for DE analysis.
\item Compute weights from the estimated $\pi$ using Bayes' formula.
\item If the observed counts are positive, $w = 1$; otherwise, $0 < w < 1$.
\item The higher $\pi$, the lower $w$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{ZINB-WaVE}
\begin{itemize}
\itemsep10pt
\item Once we have the weights, fit a weighted negative binomial generalized
linear model using the ZINB-WaVE weights.
\item End up with a matrix of fitted values.
\item Not sparse anymore; looks more like bulk RNA-seq data.\\
$\Longrightarrow$ We can use classical tools for differential expression
analysis (e.g.~edgeR, DESeq2, limma-voom in R/Bioconductor).
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{DropLasso}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{DropLasso}
\begin{itemize}
\itemsep12pt
\item Consider the following data structure:
\begin{itemize}
\itemsep10pt
\item $x_i \in \mathbb{R}^d$ --- scRNA-seq counts for cell $i$ (row $i$ of the design matrix)
\item $y_i \in \mathbb{R}$ --- cell-level outcome of interest (e.g.,
tissue of origin)
\item $\delta_i \in \{0, 1\}^d$ s.t.~$\delta_i \sim \text{Bern}(p)^d$ --- random
dropout mask
\item $\delta_i \odot x_i \in \mathbb{R}^d$ --- corrupted pattern for
scRNA-seq dropout
\item $\text{P}(\delta_{i,j} = 1) = p$ --- probability of \textit{not}
being censored by dropout
\end{itemize}
\item The DropLasso procedure seeks to identify differentially expressed genes
based on cell-level differences while accounting for the dropout noise that
masks scRNA data.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{DropLasso}
\begin{itemize}
\itemsep12pt
\item Introducing dropout ($\delta_i \sim \text{Bern}(p)^d$):
\[
\min_{w \in \R^d} \left\{ \frac{1}{n} \sum_{i = 1}^n \E_{\delta_i}
\lik \left(w, \delta_i \odot \frac{x_i}{p}, y_i \right) + \lambda
\lVert w \rVert_1 \right\}
\]
\item Independence from $p$ in expectation:
\[
\begin{aligned}
\E_{\delta_i} \sum_{j = 1}^{d} w_j \left( \delta_i \odot \frac{x_i}{p}
\right)_j &= \sum_{j = 1}^d \E_{\delta_i} w_j \delta_{i,j}
\frac{x_{i,j}}{p} \\ &= \sum_{j = 1}^d w_j x_{i,j}
\end{aligned}
\]
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{DropLasso}
\begin{itemize}
\itemsep12pt
\item Introducing the dropout term $\delta$ amounts to censoring the observed
data and adjusting (i.e., $\frac{x_i}{p}$) such that the effects of dropout
noise are removed.
\item This places a \textit{statistical model} on the dropout noise --- i.e.,
$\delta_i \sim \text{Bern}(p)^d$
\begin{itemize}
\item Dropout noise is independent across samples and genes. (Fine
starting point but probably untrue scientifically.)
\item Modeling dropout noise in a more flexible manner could likely
improve DropLasso performance and is identified as an item of future
work.
\end{itemize}
\item Merely introducing the simple dropout correction significantly improves
performance under standard modeling metrics (e.g., AUC).
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{DropLasso}
\begin{figure}[H]
\includegraphics[width=\textwidth]{droplasso}
\caption{Excerpt from table 3 of ``DropLasso: A robust variant of Lasso for
single cell RNA-seq data'' Khalfaoui \& Vert (2018)}
\end{figure}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Conclusions}
\subsection{Comparison}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{ZINB-WaVE v.~DropLasso}
\begin{itemize}
\itemsep12pt
\item ZINB-WaVE is designed to address issues in the statistical analysis
pipeline that come before differential expression analysis:
\begin{itemize}
\item Normalization
\item Dimensionality Reduction
\end{itemize}
\item Since ZINB-WaVE attempts to make scRNA-seq data resemble bulk RNA-seq
data, the weights can be used with standard differential expression tools.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{ZINB-WaVE v.~DropLasso}
\begin{itemize}
\itemsep12pt
\item DropLasso seeks to cast the scRNA-seq DE problem as a standard Lasso
problem, accounting for dropout noise using the regularization introduced in
the neural networks literature.
\item Since DropLasso is a very new method, there have been no in-depth
comparisons of the two techniques as of yet.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% don't want dimming with references
\setbeamercovered{}
\beamerdefaultoverlayspecification{}
\begin{frame}[c,allowframebreaks]{References}
\small
\bibliographystyle{plainnat}
\nocite{*}
\bibliography{refs}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}