Add digital tex/docx cheatsheets
dcetin committed Sep 25, 2019
commit 52f17ac
Showing 46 changed files with 1,643 additions and 0 deletions.
6 changes: 6 additions & 0 deletions
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@ These are a portion of the notes I kept for the lectures in my Master's in ETH Z
Notes are by no means intended to be complete or comprehensive. If you see gaps or omitted details, it is probably because I either found the topic too general or did not really understand it. My being too lazy to format it is another possible reason. That said, I welcome any suggestions for additional content.

Similarly, there could be mistakes in the notes, either because I copied and pasted parts from various sources or because I misunderstood the content. Please send me a pull request or an e-mail if you find a typo or any kind of misinformation in the notes.

Cheatsheets that were based on someone else's original work are as follows:

- AML cheatsheet adapted from [here](
- CIL cheatsheet adapted from [here](, which in turn is a fork of [this](
- PAI cheatsheet adapted from [this]( in [here](
Binary file added cheatsheets/mlhc-cheatsheet.pdf
$f(x) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{- \frac{1}{2} \frac{(x-\mu)^2}{\sigma^2}},\quad \mathcal{N}(x|\mu, \sigma)$\\
$f(x) = \frac{1}{\sqrt{(2\pi)^d\det\Sigma}} e^{- \frac{1}{2} (x-\mu)^T \Sigma^{-1} (x-\mu)},\quad \mathcal{N}(x|\mu, \Sigma)$\\
Condition number: $\kappa(A)=\frac{\sigma_{max}(A)}{\sigma_{min}(A)}$ \\
f(x) on a: $f(a)+\tfrac{f'(a)}{1!}(x-a) + \tfrac{f''(a)}{2!}(x-a)^2 + ...$ \\
Binomial: $f(k,n,p) {=} Pr(X=k) {=} \binom nk p^k (1{-}p)^{n{-}k}$ \\
$\ln(p(x|\mu, \Sigma)) {=} {-}\tfrac{d}{2}\ln(2\pi) {-} \tfrac{\ln|\Sigma|}{2} {-} \tfrac{1}{2}(x{-}\mu)^T\Sigma^{-1}(x{-}\mu)$ \\
$X {\sim} \mathcal{N}(\mu,\Sigma)$, $Y{=}A{+}BX \Rightarrow Y{\sim}\mathcal{N}(A{+}B\mu,B\Sigma B^T)$ \\
General p-norm: $\norm{ x }_p = (\sum_{i=1}^n |x_i|^p)^{1/p}$

% Variance
\item $Var[X]=\int_x(x-\mu)^2p(x) dx$ \\
\item $Var[X]=E[(X-E[X])^2]=E[X^2]-E[X]^2$ \\
\item $Var[X{+}Y]=Var[X]{+}Var[Y]{+}2Cov[X,Y]$ \\
% Covariance
\item $Cov[X,Y] = E[(X - E[X])(Y - E[Y])]$ \\
\item $Cov[aX,bY]{=}abCov[X,Y]$ \\
\item $K_{\bm{XY}} = cov(X,Y) = E[XY^T] - E[X]E[Y^T]$
\item Part.: $\int u(x)v'(x) dx = u(x)v(x) - \int v(x)u'(x) dx$\\
\item Chain r.: $z{=}f(y),\ y{=}g(x)$: $\frac{dz}{dx} \Big|_{x=x_0}= \frac{dz}{dy}\Big|_{y=g(x_0)}\cdot \frac{dy}{dx} \Big|_{x=x_0}$ \\
%\item $g_x(1) = g_x(0) + g'_x(0) + \int_{0}^{1} g_x''(s)(1-s) ds$ \\
%\item $g(\mathbf{w}+\delta) - g(\mathbf{w}) = %\int_{\mathbf{w}}^{\mathbf{w+\delta}} \nabla g(\mathbf{u}) du = (\int_{0}^{1} \nabla g(\mathbf{w}+t\delta)dt) \cdot \delta$\\
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{b}^\top \mathbf{x}) = \frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{b}) = \mathbf{b}$
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{x}) = 2\mathbf{x}$ \\
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{A}\mathbf{x}) = (\mathbf{A}^\top + \mathbf{A})\mathbf{x} \stackrel{\text{\tiny A sym.}}{=} 2\mathbf{A}\mathbf{x}$ \\
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{b}^\top \mathbf{A}\mathbf{x}) = \mathbf{A}^\top \mathbf{b}$
\item $\frac{\partial}{\partial \mathbf{X}}(\mathbf{c}^\top \mathbf{X} \mathbf{b}) = \mathbf{c}\mathbf{b}^\top$ \\
\item $\frac{\partial}{\partial \mathbf{X}}(\mathbf{c}^\top \mathbf{X}^\top \mathbf{b}) = \mathbf{b}\mathbf{c}^\top$
\item $\frac{\partial}{\partial \mathbf{x}}(\| \mathbf{x}-\mathbf{b} \|_2) = \frac{\mathbf{x}-\mathbf{b}}{\|\mathbf{x}-\mathbf{b}\|_2}$ \\
\item $\frac{\partial}{\partial \mathbf{x}}(\|\mathbf{x}\|^2_2) = \frac{\partial}{\partial \mathbf{x}} (\|\mathbf{x}^\top \mathbf{x}\|_2) = 2\mathbf{x}$
\item $\frac{\partial}{\partial \mathbf{X}}(\|\mathbf{X}\|_F^2) = 2\mathbf{X}$ \\
\item $x^T A x = Tr(x^T A x) = Tr(x x^T A) = Tr(A x x^T)$ \\
\item $\tfrac{\partial}{\partial A} Tr(AB) {=} B^T$
\item $\frac{\partial}{\partial A} log|A| {=} A^{-T}$ \\
\item $\text{sigmoid}(x) = \sigma(x) = \frac{1}{1+\exp(-x)}$ \\
\item $\nabla \text{sigmoid}(x) = \text{sigmoid}(x)(1-\text{sigmoid}(x))$ \\
\item $\nabla \text{tanh}(x) = 1-\text{tanh}^2(x)$
\item $\tanh x {=} \frac{\sinh x}{\cosh x} {=} \frac{e^{x}-e^{-x}}{e^{x} + e^{-x}}$
\subsection*{Probability / Statistics}
\item[Bayes' Rule]$ P(Y|X) = \frac{P(X|Y)P(Y)}{P(X)} = \frac{P(X|Y)P(Y)}{\sum\limits^k_{i=1}P(X|Y_i)P(Y_i)}$\\
\item[MGF] $\mathbf{M}_X(t)=\mathbb{E}[e^{\mathbf{t}^T \mathbf{X}}]$, $\mathbf{X}=(X_1,.., X_n) $
\subsection*{Jensen's inequality}
X:random variable \& $\varphi$:convex function $\rightarrow$ $\varphi(\mathbb{E}[X]) \leq \mathbb{E}[\varphi(X)]$
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
\section{Neural Network}
For each unit $j$ on the output layer:\\
- Compute error signal: $\delta_j = \ell_j'(f_j)$\\
- For each unit $i$ on layer $L$: $\frac{\partial}{\partial w_{j,i}} = \delta_j v_i$

For each unit $j$ on hidden layer $l=\{L-1,..,1\}$:\\
- Error signal: $\delta_j = \phi'(z_j) \sum_{i\in Layer_{l+1}} w_{i,j}\delta_i$\\
- For each unit $i$ on layer $l-1$: $\frac{\partial}{\partial w_{j,i}} = \delta_j v_i$
% -*- root: Main.tex -*-
\section{Time series}
\subsection*{Markov Model}
Markov assumption: $P(Y_t|Y_{1:t-1}) = P(Y_t|Y_{t-1})$\\
Stationarity assumption:\\
$P(Y_{t+1}=y_1|Y_t=y_2) = P(Y_t=y_1|Y_{t-1}=y_2)$\\
Product rule:\\
$P(Y_t,...,Y_1) = P(Y_t|Y_{t-1},...,Y_1)\cdot ... \cdot P(Y_1)$\\
Sum rule:\\
$P(Y_{t+2}|Y_{1:t}) = \sum_{Y_{t+1}^i} P(Y_{t+2},Y_{t+1}^i|Y_{1:t})$
\subsection*{Hidden Markov Model}
triplet $M = (\Sigma, Q, \Theta)$\\
$\Sigma$ symbols, $Q$ states, $\Theta=(A,E)$ transition and emission, $e_k(b)$ emission prob. $x_k \in Q, b \in \Sigma$
\subsection*{Forward/Backward - Alternative}
Goal: $P(x_t|s) \propto P(x_t,s) = P(s_{t+1:n}|x_t)P(x_t,s_{1:t})$
\subsection*{Evaluation (Forward/Backward)}
Transition A and emission E known. Sequence s given.\\
Wanted: prob that s is generated by HMM.\\
Wanted: $f_l(s_t) = P(x_t = l, s_{1:t})$\\
$f_l(s_{t+1}) = e_l(s_{t+1})\sum_k f_k(s_t) a_{k,l}$,\\
$f_l(s_1) = \pi_l e_l(s_1) \forall l \in Q$\\
Wanted: $b_l(s_t) = P(s_{t+1:n}|x_t = l)$\\
$b_l(s_t) = \sum_k e_k(s_{t+1}) b_k(s_{t+1}) a_{l,k}$,\\
$b_l(s_n) = 1 \forall l \in Q$\\
Complexity in time: $\mathcal{O}(|Q|^2 \cdot T)$

\subsection*{Decoding (Viterbi)}
Given: Observation sequence $O= \{O_1 O_2 \dots O_T \}$, $a_{ij} = P(q_{t+1} = S_j | q_t = S_i)$, $b_j(k)=P(v_k \text{at t} |q_t = S_j)$ \\
Wanted: most likely path $Q = \{q_1,q_2,\ldots q_T\}$\\
$\delta_t (i) $ best score along single path, at a time t, which accounts for the first t observations and ends in $S_i$\\
$\delta_t (j) = max_{1 \leq i \leq N}[\delta_{t-1} (i)a_{ij}]b_j(O_t) $\\
$\phi_t(j)=argmax_{1\leq i \leq N} [\delta_{t-1}(i)a_{ij}]$\\
Time: $\mathcal{O}(|S|^2 \cdot T)$,
Space: $\mathcal{O}(|S| \cdot T)$

\subsection*{Decoding (Viterbi) - Alternative}
Transition $a_{i,j} = P(x_{t+1} = j |x_t = i)$ and emission $e_l(s_t) = P(s_t|x_t=l)$ known. Sequence s given.\\
Wanted: Most likely path x responsible for the sequence.\\
$v_l(s_{t+1}) = e_l(s_{t+1}) \max_k(v_k(s_t) a_{k,l})$\\
$v_l(s_1) = \pi_l e_l(s_1) \forall l \in Q$\\
Time: $\mathcal{O}(|Q|^2 \cdot T)$, Space: $\mathcal{O}(|Q| \cdot T)$
\subsection*{Learning (Baum-Welch)}
Know: Set of sequences $s^1,...,s^m$\\
Wanted: max transition A and emission E\\
\textbf{E-step I:} Compute all $f_k(s_t^j)$ (forward-algo.) \& $b_k(s_t^j)$ (backward algo.)\\
\textbf{E-step II:} Compute $A_{kl}$, $E_k(b)$ for all states and symbols\\
$A_{kl} = \sum_{j=1}^{m} \frac{1}{P(\textbf{s}^j)} \sum_{t=1}^{n}f_k^j (s_t^j)a_{kl}e_l(s_{t+1}^j)b_l^j(s_{t+1}^j)$\\
\textbf{M-step:} Compute param. estimates $a_{kl}$, $e_k(b)$\\
$a_{kl}=\frac{A_{kl}}{\sum_{i=1}^{n}A_{ki}}$, $e_k(b)=\frac{E_k(b)}{\sum_{b'}E_k(b')}$\\
Complexity: $\mathcal{O}(|\Sigma|^2)$ in storage (space).
%\subsection*{Linear Regression}
%Error: $\hat{R}(w) = \sum_{i=1}^n (y_i - w^Tx_i)^2 = ||Xw-y||^2_2$\\
%Closed form: $w^*=(X^T X)^{-1} X^T y$\\
%Gradient: $\nabla_w \hat{R}(w) = 2X^T (Xw-y)$
Consistency: $\hat{\theta_n} \stackrel{\text{\tiny P}}{\rightarrow} \theta$,
i.e. $\forall\epsilon > 0:\ P \{|\hat{\theta_n}-\theta| \geq\epsilon\} \stackrel{\tiny n \to\infty}{\longrightarrow} 0 $\\
Asymptotic normality: $\sqrt{N}(\theta - \hat{\theta_n}) \to \mathcal{N}(0, J^{-1}IJ^{-1})$ \\
Asymptotic efficiency: $\hat{\theta_n}$ has the smallest variance among all possible consistent estimators (for large enough N), i.e. $\lim_{n\to\infty} (V[\hat{\theta_n}]I(\theta))^{-1} = 1$
$\hat{\theta}_{MAP} := \argmax_\theta \left \{ \sum_{i=1}^n \log p(x_i | \theta) + \log p(\theta) \right\}$
$\Lambda = \frac{\partial \log \mathbb{P}(x|\theta )}{\partial \theta}$ (score function), $E[\Lambda ]=0$\\
Fisher information: $I= \mathbb{V}[\Lambda]$ \\
$J= E[\Lambda^{2}]= -E[\frac{\partial^2 \log \mathbb{P}(x|\theta ) }{\partial \theta \partial \theta ^{T}}]= -E[\frac{\partial \Lambda}{\partial \theta}]$ \\
variance of an estimator is bounded from below by the inverse of Fisher information \\
MSE bound: $E[(\hat \theta -\theta )^{2}] \geq \frac{[1 + b^{\prime} (\theta)]^{2}}{n E[\Lambda ^{2}]} + b_{\hat \theta}^{2}$ \\
Biased estimators: $var(\hat{\theta}) \geq \frac{[1 + b^{\prime}(\theta)]^2}{I(\theta)}$ \\
Efficiency: $e(\hat{\theta}) = \frac{I(\theta)^{-1}}{var(\hat{\theta})} \leq 1$ \\
Cauchy-Schwarz: $|E(XY)|^2 \leq E(X^2) E(Y^2)$

\subsection*{Regularized regression}
Error: $\hat{R}(w) = \sum \limits_{i=1}^n (y_i - w^Tx_i)^2 + \lambda ||w||_2^2$ (Ridge) \\
Closed form: $w^*=(X^T X + \lambda I)^{-1} X^T y$ (Ridge)\\
%Grad: $\nabla_w \hat{R}(w) = -2 \sum_{i=1}^n (y_i-w^T x_i) \cdot x_i + 2 \lambda w$\\
{\small{} Shrinkage:} $Xw^*{=}\sum_{j=1}^{d} u_j\frac{\sigma_j^2}{\sigma_j^2+\lambda}u_j^Ty$, $X{=}U\Sigma V^T$
LASSO: $w^* = \underset{w}{\operatorname{argmin}} \sum \limits_{i=1}^n (y_i - w^Tx_i)^2 + \lambda ||w||_1$

\subsection*{Bayesian linear regression}
Model: \= $y = X^T \beta + \epsilon$, with $\epsilon \sim
\mathcal{N}(\epsilon | 0, \sigma^2 I)$ or
\> $P(y | X, \beta, \sigma) = \mathcal{N}(y | X^T \beta , \sigma^2 I)$
$P(\beta | \Lambda) = \mathcal{N} (\beta | 0, \Lambda^{-1})$, Post: $P(\beta | X, y, \Lambda) = \mathcal{N}(\beta | \mu_\beta, \Sigma_\beta)$
$\mu_\beta = (X^T X + \sigma^2 \Lambda)^{-1} X^T y$, $\Sigma_\beta = \sigma^2(X^T X + \sigma^2 \Lambda)^{-1}$
Prediction: \> $y_{new} = \hat{\beta}_{\scaleto{MAP}{4pt}}^T x_{new} = \mu_\beta ^T x_{new}$
$P(y_{new} | x_{new}, X, y, \beta)
= \mathcal{N}(\mu_\beta ^T x_{new}, \sigma^2 + x_{new}^T \Sigma_\beta x_{new})$

\subsection*{Combination of Regression Models:}
$\text{bias}[\hat{f}(x)] = \frac{1}{B} \sum_{i=1}^{B} \text{bias}[\hat{f}_i(x)]$\\
Var$[\hat{f}(x)] = \frac{1}{B^2}\sum_i$ Var$[\hat{f}_i(x)]
+ \frac{1}{B^2}\sum_{i,j:i\neq j}$ Cov$[\hat{f}_i(x), \hat{f}_j(x)] \approx \frac{\sigma^2}{B}$
% \subsection*{Smoothing Splines}
% $RSS(f,\lambda) = \sum\limits_{i=1}^n (y_i - f(x_i))^2 + \lambda \int (f''(x))^2dx$\\

\subsection*{RSS Estimator}
$\hat{\beta} \sim \mathcal{N}(\beta,(X^TX)^{-1}\sigma^2)$.
%\textbf{Unbiasedness}: $\mathbb{E}[\hat{\beta}] = \mathbb{E}[(X^TX)^{-1}X^Ty] = (X^TX)^{-1}X^T\mathbb{E}[X\beta+\epsilon] = (X^TX)^{-1}(X^TX)\beta+X^T\mathbb{E}[\epsilon] = \beta + 0$
%\textbf{Variance of} $a^T\hat{\beta}$: $\mathbb{V}(a^T(X^TX)^{-1}X^T(X\beta + \epsilon)) = \mathbb{V}(a^T\beta) + \mathbb{E}(a^T(X^TX)^{-1}X^T\epsilon\epsilon^TX(X^TX)^{-1}a) = \sigma^2 a^T(X^TX)^{-1}a$

%\subsection*{Gauss-Markov Theorem}
%For any linear estimator $\widetilde{\theta}=c^T\mathbf{y}$ that is unbiased for $a^T\beta$ it holds: $\mathbb{V}(a^T\hat{\beta}) \leq \mathbb{V}(c^T\mathbf{y})$\\
%Proof: Let $c^T \mathbf{y} = a^T\hat{\beta} + a^T\mathbf{D}\mathbf{y} = a^T((\mathbf{X^TX})^{-1}\mathbf{X}^T + \mathbf{D})\mathbf{y}$ be an unbiased estimator of $a^T \beta$; then it follow $a^T \mathbf{DX}\beta = 0$ which implies $\mathbf{DX} = 0$.\\
%$\mathbb{V}(c^T \mathbf{y}) = \mathbb{E}[(c^T \mathbf{y})^2]-\mathbb{E}(c^T \mathbf{y})^2 = c^T(\mathbb{E}\mathbf{y}\mathbf{y}^T - \mathbb{E}\mathbf{y}\mathbb{E}\mathbf{y}^T)c = \sigma^2 c^T c $
%= $\sigma^2 \big( a^T ((\mathbf{X^T X})^{-1}\mathbf{X}^T + \mathbf{D}) (\mathbf{X}(\mathbf{X^T X})^{-1}+\mathbf{D}^T)a \big )$\\
%= $\sigma^2 \big( a^T (\mathbf{X^T X})^{-1}a +\mathbf{DD^T}a \big )$
%= $\mathbb{V}(a^T\hat{\beta}) + a^T \mathbf{DD^T}a \geq \mathbb{V}(a^T\hat{\beta})$ (note: $\mathbf{DD^T}$ is PSD)

\subsection*{Bias vs. Variance}
\E_D\E_{X,Y}\left(\hat{f}(X)-Y\right)^2 = \\
\E_D\E_X\left(\hat{f}(X) - \E(Y|X)\right)^2 + \E_{X,Y}\left(Y - \E(Y|X)\right)^2\\
= \E_X \E_D\left(\hat{f}(X) - \E_D(\hat{f}(X))\right)^2 \text{(variance)}\\
+ \E_X\left(\E_D(\hat{f}(X)) - \E(Y|X)\right)^2 \text{(bias}^2)\\
+ \E_{X,Y}\left(Y - \E(Y|X)\right)^2 \text{(noise)}
%High bias can cause an algorithm to miss the relevant relations between features and target outputs (underfitting).\\
%High variance can cause overfitting: modeling the random noise in the training data, rather than the intended outputs.

% \subsection*{Gradient Descent}
% 1. Start arbitrary $w_o \in \mathbb{R}$\\
% 2. For $i$ do $w_{t+1} = w_t - \eta_t \nabla \hat{R}(w_t)$

%\subsection*{Curse of Dimensionality}
%To obtain a reliable estimate at a given regularity, the required number of samples grows exponentially with the dimension of the sample space.

% \subsection*{Expected Error}
% For generalization, minimize the expected error
% $R(w) = \int P(x,y) (y-w^Tx)^2 \partial x \partial y$\\
% $= \mathbb{E}_{x,y}[(y-w^Tx)^2]$

\subsection*{Ridge Parametric to nonparametric}
Ansatz: $w=\sum_i \alpha_i x_i$\\
$w^* = \underset{w}{\operatorname{argmin}} \sum_i (w^Tx_i-y_i)^2 + \lambda ||w||_2^2$ = \\
${\operatorname{argmin}}_{\alpha_{1:n}} \sum_{i=1}^n (\sum_{j=1}^n \alpha_j x_j^T x_i - y_i)^2 + \lambda \sum_i \sum_j \alpha_i \alpha_j (x_i^T x_j)$\\
$= {\operatorname{argmin}}_{\alpha_{1:n}} \sum_{i=1}^n (\alpha^T K_i - y_i)^2 + \lambda \alpha^T K \alpha$\\
$= {\operatorname{argmin}}_{\alpha} ||\alpha^T K -y||_2^2 + \lambda \alpha^T K \alpha$\\
Closed form: $\alpha^* = (K+\lambda I)^{-1} y$\\
Prediction: $y^*= w^{*T} x = \sum_{i=1}^n \alpha_i^* k(x_i,x)$
\section{Bayesian Methods}
$\theta^* = \operatorname{argmax}_\theta P(y|x,\theta) $\\
$= {\operatorname{argmax}}_\theta \prod_{i=1}^n P(y_i|x_i, \theta) \text{\quad (iid)}$\\
$= {\operatorname{argmax}}_\theta \sum_{i=1}^n log P(y_i|x_i,\theta)$

$w^* = \underset{w}{\operatorname{argmax}} P(w|x,y) = \underset{w}{\operatorname{argmax}} \frac{P(w|x) P(y|x,w)}{P(y|x)}$\\
$=\underset{w}{\operatorname{argmax}} log P(w) + \sum_i log P(y_i|x_i,w) + const.$

\subsection*{MLE = MAP}
$n \rightarrow \infty$ or prior is uniformly distr.

