Showing 46 changed files with 1,643 additions and 0 deletions.
Binary file not shown. (4 files)
File renamed without changes. (9 files)
@@ -0,0 +1,51 @@
\section{Basics}
$f(x) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{- \frac{1}{2} \frac{(x-\mu)^2}{\sigma^2}},\quad \mathcal{N}(x|\mu, \sigma^2)$\\
$f(x) = \frac{1}{\sqrt{(2\pi)^d\det\Sigma}} e^{- \frac{1}{2} (x-\mu)^T \Sigma^{-1} (x-\mu)},\quad \mathcal{N}(x|\mu, \Sigma)$\\
Condition number: $\kappa(A)=\frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$ \\
Taylor expansion of $f(x)$ around $a$: $f(a)+\tfrac{f'(a)}{1!}(x-a) + \tfrac{f''(a)}{2!}(x-a)^2 + \dots$ \\
Binomial: $f(k,n,p) {=} Pr(X=k) {=} \binom nk p^k (1{-}p)^{n{-}k}$ \\
$\ln(p(x|\mu, \Sigma)) {=} {-}\tfrac{d}{2}\ln(2\pi) {-} \tfrac{\ln|\Sigma|}{2} {-} \tfrac{1}{2}(x{-}\mu)^T\Sigma^{-1}(x{-}\mu)$ \\
$X {\sim} \mathcal{N}(\mu,\Sigma)$, $Y{=}A{+}BX \Rightarrow Y{\sim}\mathcal{N}(A{+}B\mu,B\Sigma B^T)$ \\
General p-norm: $\norm{ x }_p = (\sum_{i=1}^n |x_i|^p)^{1/p}$
\subsection*{Moments}
\begin{inparaitem}[\color{red}\textbullet]
% Variance
\item $Var[X]=\int_x(x-\mu)^2p(x) dx$ \\
\item $Var[X]=E[(X-E[X])^2]=E[X^2]-E[X]^2$ \\
\item $Var[X{+}Y]=Var[X]{+}Var[Y]{+}2Cov[X,Y]$ \\
% Covariance
\item $Cov[X,Y] = E[(X - E[X])(Y - E[Y])]$ \\
\item $Cov[aX,bY]{=}abCov[X,Y]$ \\
\item $K_{\bm{XY}} = cov(X,Y) = E[XY^T] - E[X]E[Y^T]$
\end{inparaitem}
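A quick numpy sanity check of the variance and covariance identities above (the sample data and coefficients are illustrative assumptions, not from the sheet):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=100_000)
y = 0.5 * x + rng.normal(size=100_000)            # correlated with x

# Var[X+Y] = Var[X] + Var[Y] + 2 Cov[X,Y] (exact for sample moments with ddof=1)
lhs = np.var(x + y, ddof=1)
rhs = np.var(x, ddof=1) + np.var(y, ddof=1) + 2 * np.cov(x, y)[0, 1]
print(np.isclose(lhs, rhs))                       # True

# Cov[aX, bY] = a b Cov[X, Y]
a, b = 2.0, -3.0
print(np.isclose(np.cov(a * x, b * y)[0, 1], a * b * np.cov(x, y)[0, 1]))  # True
```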
\subsection*{Calculus}
\begin{inparaitem}[\color{red}\textbullet]
\item Part.: $\int u(x)v'(x) dx = u(x)v(x) - \int v(x)u'(x) dx$\\
\item Chain r.: $z{=}f(y),\ y{=}g(x)$: $\frac{dz}{dx} \Big|_{x=x_0}= \frac{dz}{dy}\Big|_{y=g(x_0)}\cdot \frac{dy}{dx} \Big|_{x=x_0}$ \\
%\item $g_x(1) = g_x(0) + g'_x(0) + \int_{0}^{1} g_x''(s)(1-s) ds$ \\
%\item $g(\mathbf{w}+\delta) - g(\mathbf{w}) = %\int_{\mathbf{w}}^{\mathbf{w+\delta}} \nabla g(\mathbf{u}) du = (\int_{0}^{1} \nabla g(\mathbf{w}+t\delta)dt) \cdot \delta$\\
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{b}^\top \mathbf{x}) = \frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{b}) = \mathbf{b}$
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{x}) = 2\mathbf{x}$ \\
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{A}\mathbf{x}) = (\mathbf{A}^\top + \mathbf{A})\mathbf{x} \stackrel{\text{\tiny A sym.}}{=} 2\mathbf{A}\mathbf{x}$ \\
\item $\frac{\partial}{\partial \mathbf{x}}(\mathbf{b}^\top \mathbf{A}\mathbf{x}) = \mathbf{A}^\top \mathbf{b}$
\item $\frac{\partial}{\partial \mathbf{X}}(\mathbf{c}^\top \mathbf{X} \mathbf{b}) = \mathbf{c}\mathbf{b}^\top$ \\
\item $\frac{\partial}{\partial \mathbf{X}}(\mathbf{c}^\top \mathbf{X}^\top \mathbf{b}) = \mathbf{b}\mathbf{c}^\top$
\item $\frac{\partial}{\partial \mathbf{x}}(\| \mathbf{x}-\mathbf{b} \|_2) = \frac{\mathbf{x}-\mathbf{b}}{\|\mathbf{x}-\mathbf{b}\|_2}$ \\
\item $\frac{\partial}{\partial \mathbf{x}}(\|\mathbf{x}\|^2_2) = \frac{\partial}{\partial \mathbf{x}} (\mathbf{x}^\top \mathbf{x}) = 2\mathbf{x}$
\item $\frac{\partial}{\partial \mathbf{X}}(\|\mathbf{X}\|_F^2) = 2\mathbf{X}$ \\
\item $x^T A x = Tr(x^T A x) = Tr(x x^T A) = Tr(A x x^T)$ \\
\item $\tfrac{\partial}{\partial A} Tr(AB) {=} B^T$
\item $\frac{\partial}{\partial A} \log|A| {=} A^{-T}$ \\
\item $\text{sigmoid}(x) = \sigma(x) = \frac{1}{1+\exp(-x)}$ \\
\item $\nabla \text{sigmoid}(x) = \text{sigmoid}(x)(1-\text{sigmoid}(x))$ \\
\item $\nabla \text{tanh}(x) = 1-\text{tanh}^2(x)$
\item $\tanh x {=} \frac{\sinh x}{\cosh x} {=} \frac{e^{x}-e^{-x}}{e^{x} + e^{-x}}$
\end{inparaitem}
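A finite-difference check (illustrative numpy sketch, dimensions chosen arbitrarily) of the identity $\frac{\partial}{\partial \mathbf{x}}(\mathbf{x}^\top \mathbf{A}\mathbf{x}) = (\mathbf{A}^\top + \mathbf{A})\mathbf{x}$:

```python
import numpy as np

rng = np.random.default_rng(1)
A = rng.normal(size=(4, 4))        # deliberately not symmetric
x = rng.normal(size=4)

f = lambda v: v @ A @ v            # f(x) = x^T A x

eps = 1e-6
num_grad = np.array([(f(x + eps * e) - f(x - eps * e)) / (2 * eps)
                     for e in np.eye(4)])
print(np.allclose(num_grad, (A.T + A) @ x, atol=1e-5))   # True
```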
\subsection*{Probability / Statistics}
\begin{compactdesc}
\item[Bayes' Rule] $P(Y|X) = \frac{P(X|Y)P(Y)}{P(X)} = \frac{P(X|Y)P(Y)}{\sum\limits^k_{i=1}P(X|Y_i)P(Y_i)}$\\
\item[MGF] $\mathbf{M}_X(t)=\mathbb{E}[e^{\mathbf{t}^T \mathbf{X}}]$, $\mathbf{X}=(X_1,.., X_n)$
\end{compactdesc}
\subsection*{Jensen's inequality}
For a random variable $X$ and a convex function $\varphi$: $\varphi(\mathbb{E}[X]) \leq \mathbb{E}[\varphi(X)]$
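A one-line numerical illustration of Jensen's inequality with the convex function $\varphi(t)=t^2$, i.e. $(\mathbb{E}[X])^2 \leq \mathbb{E}[X^2]$; the exponential samples are only an example:

```python
import numpy as np

x = np.random.default_rng(2).exponential(scale=2.0, size=100_000)
print(np.mean(x) ** 2 <= np.mean(x ** 2))   # True: phi(E[X]) <= E[phi(X)] for phi(t) = t^2
```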
@@ -0,0 +1,9 @@
\section{Neural Network}
\subsection*{Backpropagation}
For each unit $j$ on the output layer:\\
- Compute error signal: $\delta_j = \ell_j'(f_j)$\\
- For each unit $i$ on layer $L$: $\frac{\partial \ell}{\partial w_{j,i}} = \delta_j v_i$

For each unit $j$ on hidden layer $l=\{L-1,..,1\}$:\\
- Error signal: $\delta_j = \phi'(z_j) \sum_{i\in Layer_{l+1}} w_{i,j}\delta_i$\\
- For each unit $i$ on layer $l-1$: $\frac{\partial \ell}{\partial w_{j,i}} = \delta_j v_i$
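A minimal numpy sketch of these two rules for one hidden layer with $\phi = \tanh$ and squared loss; the names, shapes, and loss choice are illustrative assumptions:

```python
import numpy as np

rng = np.random.default_rng(3)
x, y = rng.normal(size=5), 1.0                  # one training example
W1, w2 = rng.normal(size=(3, 5)), rng.normal(size=3)

# forward pass
z = W1 @ x                                      # hidden pre-activations z_j
v = np.tanh(z)                                  # hidden activations v_i
f = w2 @ v                                      # output unit

# output layer: delta_j = l'(f_j), here l(f) = (f - y)^2 / 2
delta_out = f - y
grad_w2 = delta_out * v                         # d l / d w_{j,i} = delta_j v_i

# hidden layer: delta_j = phi'(z_j) * sum_i w_{i,j} delta_i
delta_hidden = (1 - np.tanh(z) ** 2) * (w2 * delta_out)
grad_W1 = np.outer(delta_hidden, x)             # inputs play the role of v_i one layer below
```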
@@ -0,0 +1,53 @@
% -*- root: Main.tex -*-
\section{Time series}
\subsection*{Markov Model}
Markov assumption: $P(Y_t|Y_{1:t-1}) = P(Y_t|Y_{t-1})$\\
Stationarity assumption:\\
$P(Y_{t+1}=y_1|Y_t=y_2) = P(Y_t=y_1|Y_{t-1}=y_2)$\\
Product rule:\\
$P(Y_t,...,Y_1) = P(Y_t|Y_{t-1},...,Y_1)\cdot ... \cdot P(Y_1)$\\
Sum rule:\\
$P(Y_{t+2}|Y_{1:t}) = \sum_{y} P(Y_{t+2}, Y_{t+1}{=}y|Y_{1:t})$
\subsection*{Hidden Markov Model}
triplet $M = (\Sigma, Q, \Theta)$\\
$\Sigma$ symbols, $Q$ states, $\Theta=(A,E)$ transition and emission, $e_k(b)$ emission prob., $x_k \in Q, b \in \Sigma$
\subsection*{Forward/Backward - Alternative}
Goal: $P(x_t|s) \propto P(x_t,s) = P(s_{t+1:n}|x_t)P(x_t,s_{1:t})$
\subsection*{Evaluation (Forward/Backward)}
Transition A and emission E known. Sequence s given.\\
Wanted: prob. that s is generated by the HMM.\\
\textbf{Forward:}\\
Wanted: $f_l(s_t) = P(x_t = l, s_{1:t})$\\
$f_l(s_{t+1}) = e_l(s_{t+1})\sum_k f_k(s_t) a_{k,l}$,\\
$f_l(s_1) = \pi_l e_l(s_1) \ \forall l \in Q$\\
\textbf{Backward:}\\
Wanted: $b_l(s_t) = P(s_{t+1:n}|x_t = l)$\\
$b_l(s_t) = \sum_k e_k(s_{t+1}) b_k(s_{t+1}) a_{l,k}$,\\
$b_l(s_n) = 1 \ \forall l \in Q$\\
Complexity in time: $\mathcal{O}(|Q|^2 \cdot T)$
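A minimal numpy sketch of the forward recursion; the toy transition matrix a, emission matrix e, initial distribution pi, and symbol sequence are illustrative assumptions:

```python
import numpy as np

pi = np.array([0.6, 0.4])                          # pi_l, initial distribution
a = np.array([[0.7, 0.3], [0.4, 0.6]])             # a[k, l] = P(x_{t+1}=l | x_t=k)
e = np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]])   # e[l, b] = P(s_t=b | x_t=l)
s = [0, 2, 1, 2]                                   # observed symbol sequence

f = pi * e[:, s[0]]                                # f_l(s_1) = pi_l e_l(s_1)
for t in range(1, len(s)):
    f = e[:, s[t]] * (f @ a)                       # f_l(s_{t+1}) = e_l(s_{t+1}) sum_k f_k(s_t) a_{k,l}
print(f.sum())                                     # P(s) = sum_l f_l(s_n)
```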
\subsection*{Decoding (Viterbi)}
Given: Observation sequence $O= \{O_1 O_2 \dots O_T \}$, $a_{ij} = P(q_{t+1} = S_j | q_t = S_i)$, $b_j(k)=P(v_k \text{ at } t |q_t = S_j)$ \\
Wanted: most likely path $Q = \{q_1,q_2,\ldots q_T\}$\\
$\delta_t (i)$: best score along a single path ending in $S_i$ at time $t$, accounting for the first $t$ observations\\
$\delta_t (j) = \max_{1 \leq i \leq N}[\delta_{t-1} (i)a_{ij}]b_j(O_t)$\\
$\phi_t(j)={\operatorname{argmax}}_{1\leq i \leq N} [\delta_{t-1}(i)a_{ij}]$\\
Time: $\mathcal{O}(|S|^2 \cdot T)$,
Space: $\mathcal{O}(|S| \cdot T)$
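A short numpy sketch of the Viterbi recursion and backtracking (same kind of toy parameters as assumed above):

```python
import numpy as np

pi = np.array([0.6, 0.4])                          # initial distribution
a = np.array([[0.7, 0.3], [0.4, 0.6]])             # a[i, j] = P(q_{t+1}=S_j | q_t=S_i)
b = np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]])   # b[j, k] = P(v_k at t | q_t=S_j)
O = [0, 2, 1, 2]                                   # observation sequence

delta = pi * b[:, O[0]]
psi = []
for t in range(1, len(O)):
    scores = delta[:, None] * a                    # scores[i, j] = delta_{t-1}(i) a_{ij}
    psi.append(scores.argmax(axis=0))              # phi_t(j)
    delta = scores.max(axis=0) * b[:, O[t]]        # delta_t(j)

path = [int(delta.argmax())]                       # backtrack the most likely path
for back in reversed(psi):
    path.append(int(back[path[-1]]))
path.reverse()
print(path)
```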
\subsection*{Decoding (Viterbi) - Alternative}
Transition $a_{i,j} = P(x_{t+1} = j |x_t = i)$ and emission $e_l(s_t) = P(s_t|x_t=l)$ known. Sequence s given.\\
Wanted: most likely path x responsible for the sequence.\\
$v_l(s_{t+1}) = e_l(s_{t+1}) \max_k(v_k(s_t) a_{k,l})$\\
$v_l(s_1) = \pi_l e_l(s_1) \ \forall l \in Q$\\
Time: $\mathcal{O}(|Q|^2 \cdot T)$, Space: $\mathcal{O}(|Q| \cdot T)$
\subsection*{Learning (Baum-Welch)}
Known: set of sequences $s^1,...,s^m$\\
Wanted: maximum-likelihood transition A and emission E\\
\textbf{E-step I:} Compute all $f_k(s_t^j)$ (forward algo.) \& $b_k(s_t^j)$ (backward algo.)\\
\textbf{E-step II:} Compute $A_{kl}$, $E_k(b)$ for all states and symbols\\
$A_{kl} = \sum_{j=1}^{m} \frac{1}{P(\textbf{s}^j)} \sum_{t=1}^{n}f_k^j (s_t^j)a_{kl}e_l(s_{t+1}^j)b_l^j(s_{t+1}^j)$\\
$E_k(b)=\sum_{j=1}^{m}\frac{1}{P(\textbf{s}^j)}\sum_{t|s_t^j=b}f_k^j(s_t^j)b_k^j(s_t^j)$\\
\textbf{M-step:} Compute param. estimates $a_{kl}$, $e_k(b)$\\
$a_{kl}=\frac{A_{kl}}{\sum_{l'}A_{kl'}}$, $e_k(b)=\frac{E_k(b)}{\sum_{b'}E_k(b')}$\\
Complexity: $\mathcal{O}(|\Sigma|^2)$ in storage (space).
@@ -0,0 +1,92 @@
% -*- root: Main.tex -*-
\section{Regression}
%\subsection*{Linear Regression}
%Error: $\hat{R}(w) = \sum_{i=1}^n (y_i - w^Tx_i)^2 = ||Xw-y||^2_2$\\
%Closed form: $w^*=(X^T X)^{-1} X^T y$\\
%Gradient: $\nabla_w \hat{R}(w) = 2X^T (Xw-y)$
\subsection*{Estimation}
Consistency: $\hat{\theta_n} \stackrel{\text{\tiny P}}{\rightarrow} \theta$,
i.e. $\forall\epsilon{:}\ P \{|\hat{\theta_n}-\theta| \geq\epsilon\} \stackrel{\tiny n \to\infty}{\longrightarrow} 0 $\\
Asymptotic normality: $\sqrt{N}(\theta - \hat{\theta_n}) \to \mathcal{N}(0, J^{-1}IJ^{-1})$ \\
Asymptotic efficiency: $\hat{\theta_n}$ has the smallest variance among all possible consistent estimators (for large enough N), i.e. $\lim_{n\to\infty} (V[\hat{\theta_n}]I(\theta))^{-1} = 1$\\
$\hat{\theta}_{MAP} := \argmax_\theta \left\{ \sum_{i=1}^n \log p(x_i | \theta) + \log p(\theta) \right\}$
\subsection*{Rao-Cramer}
$\Lambda = \frac{\partial \log \mathbb{P}(x|\theta )}{\partial \theta}$ (score function), $E[\Lambda ]=0$\\
Fisher information: $I= \mathbb{V}[\Lambda]$ \\
$J= E[\Lambda^{2}]= -E[\frac{\partial^2 \log \mathbb{P}(x|\theta ) }{\partial \theta \partial \theta ^{T}}]= -E[\frac{\partial \Lambda}{\partial \theta}]$ \\
The variance of an estimator is bounded from below by the inverse of the Fisher information. \\
MSE bound: $E[(\hat \theta -\theta )^{2}] \geq \frac{[1 + b^{\prime} (\theta)]^{2}}{n E[\Lambda ^{2}]} + b_{\hat \theta}^{2}$ \\
Biased estimators: $var(\hat{\theta}) \geq \frac{[1 + b^{\prime}(\theta)]^2}{I(\theta)}$ \\
Efficiency: $e(\hat{\theta}) = \frac{I(\theta)^{-1}}{var(\hat{\theta})} \leq 1$ \\
Cauchy-Schwarz: $|E[XY]|^2 \leq E[X^2] E[Y^2]$
\subsection*{Regularized regression}
Error: $\hat{R}(w) = \sum \limits_{i=1}^n (y_i - w^Tx_i)^2 + \lambda ||w||_2^2$ (Ridge) \\
Closed form: $w^*=(X^T X + \lambda I)^{-1} X^T y$ (Ridge)\\
%Grad: $\nabla_w \hat{R}(w) = -2 \sum_{i=1}^n (y_i-w^T x_i) \cdot x_i + 2 \lambda w$\\
{\small{} Shrinkage:} $Xw^*{=}\sum_{j=1}^{d} u_j\frac{\sigma_j^2}{\sigma_j^2+\lambda}u_j^Ty$, $X{=}U\Sigma V^T$\\
LASSO: $w^* = \underset{w}{\operatorname{argmin}} \sum \limits_{i=1}^n (y_i - w^Tx_i)^2 + \lambda ||w||_1$
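The ridge closed form is easy to check numerically; a minimal numpy sketch with made-up data (n, d, and lambda are arbitrary assumptions):

```python
import numpy as np

rng = np.random.default_rng(4)
n, d, lam = 50, 5, 0.1
X = rng.normal(size=(n, d))
y = X @ rng.normal(size=d) + 0.1 * rng.normal(size=n)

# Ridge closed form: w* = (X^T X + lambda I)^{-1} X^T y
w_ridge = np.linalg.solve(X.T @ X + lam * np.eye(d), X.T @ y)

# At the optimum the gradient of the ridge objective vanishes
print(np.allclose(2 * X.T @ (X @ w_ridge - y) + 2 * lam * w_ridge, 0, atol=1e-8))  # True
```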
\subsection*{Bayesian linear regression}
Model: $y = X \beta + \epsilon$, with $\epsilon \sim \mathcal{N}(\epsilon | 0, \sigma^2 I)$, i.e. $P(y | X, \beta, \sigma) = \mathcal{N}(y | X \beta , \sigma^2 I)$\\
Prior: $P(\beta | \Lambda) = \mathcal{N} (\beta | 0, \Lambda^{-1})$, Posterior: $P(\beta | X, y, \Lambda) = \mathcal{N}(\beta | \mu_\beta, \Sigma_\beta)$\\
$\mu_\beta = (X^T X + \sigma^2 \Lambda)^{-1} X^T y$, $\Sigma_\beta = \sigma^2(X^T X + \sigma^2 \Lambda)^{-1}$\\
Prediction: $y_{new} = \hat{\beta}_{\scaleto{MAP}{4pt}}^T x_{new} = \mu_\beta ^T x_{new}$\\
$P(y_{new} | x_{new}, X, y, \beta) = \mathcal{N}(\mu_\beta ^T x_{new}, \sigma^2 + x_{new}^T \Sigma_\beta x_{new})$
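A short numpy sketch of the posterior and predictive formulas above; the prior precision $\Lambda = I$, the noise level, and the shapes are illustrative assumptions:

```python
import numpy as np

rng = np.random.default_rng(5)
n, d, sigma = 30, 3, 0.5
X = rng.normal(size=(n, d))
y = X @ rng.normal(size=d) + sigma * rng.normal(size=n)

Lam = np.eye(d)                                       # prior precision Lambda
S = np.linalg.inv(X.T @ X + sigma**2 * Lam)
mu_beta = S @ X.T @ y                                 # posterior mean
Sigma_beta = sigma**2 * S                             # posterior covariance

x_new = rng.normal(size=d)
pred_mean = mu_beta @ x_new                           # y_new = mu_beta^T x_new
pred_var = sigma**2 + x_new @ Sigma_beta @ x_new      # predictive variance
print(pred_mean, pred_var)
```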
\subsection*{Combination of Regression Models}
$\text{bias}[\hat{f}(x)] = \frac{1}{B} \sum_{i=1}^{B} \text{bias}[\hat{f}_i(x)]$\\
Var$[\hat{f}(x)] = \frac{1}{B^2}\sum_i$ Var$[\hat{f}_i(x)] + \frac{1}{B^2}\sum_{i,j:i\neq j}$ Cov$[\hat{f}_i(x), \hat{f}_j(x)] \approx \frac{\sigma^2}{B}$
% \subsection*{Smoothing Splines}
% $RSS(f,\lambda) = \sum\limits_{i=1}^n (y_i - f(x_i))^2 + \lambda \int (f''(x))^2dx$\\
\subsection*{RSS Estimator}
$\hat{\beta} \sim \mathcal{N}(\beta,(X^TX)^{-1}\sigma^2)$.
%\textbf{Unbiasedness}: $\mathbb{E}[\hat{\beta}] = \mathbb{E}[(X^TX)^{-1}X^Ty] = (X^TX)^{-1}X^T\mathbb{E}[X\beta+\epsilon] = (X^TX)^{-1}(X^TX)\beta+X^T\mathbb{E}[\epsilon] = \beta + 0$
%\textbf{Variance of} $a^T\hat{\beta}$: $\mathbb{V}(a^T(X^TX)^{-1}X^T(X\beta + \epsilon)) = \mathbb{V}(a^T\beta) + \mathbb{E}(a^T(X^TX)^{-1}X^T\epsilon\epsilon^TX(X^TX)^{-1}a) = \sigma^2 a^T(X^TX)^{-1}a$
%\subsection*{Gauss-Markov Theorem}
%For any linear estimator $\widetilde{\theta}=c^T\mathbf{y}$ that is unbiased for $a^T\beta$ it holds: $\mathbb{V}(a^T\hat{\beta}) \leq \mathbb{V}(c^T\mathbf{y})$\\
%Proof: Let $c^T \mathbf{y} = a^T\hat{\beta} + a^T\mathbf{D}\mathbf{y} = a^T((\mathbf{X^TX})^{-1}\mathbf{X}^T + \mathbf{D})\mathbf{y}$ be an unbiased estimator of $a^T \beta$; then it follows $a^T \mathbf{DX}\beta = 0$ which implies $\mathbf{DX} = 0$.\\
%$\mathbb{V}(c^T \mathbf{y}) = \mathbb{E}[(c^T \mathbf{y})^2]-\mathbb{E}(c^T \mathbf{y})^2 = c^T(\mathbb{E}\mathbf{y}\mathbf{y}^T - \mathbb{E}\mathbf{y}\mathbb{E}\mathbf{y}^T)c = \sigma^2 c^T c $
%= $\sigma^2 \big( a^T ((\mathbf{X^T X})^{-1}\mathbf{X}^T + \mathbf{D}) (\mathbf{X}(\mathbf{X^T X})^{-1}+\mathbf{D}^T)a \big )$\\
%= $\sigma^2 \big( a^T (\mathbf{X^T X})^{-1}a +\mathbf{DD^T}a \big )$
%= $\mathbb{V}(a^T\hat{\beta}) + a^T \mathbf{DD^T}a \geq \mathbb{V}(a^T\hat{\beta})$ (note: $\mathbf{DD^T}$ is PSD)
\subsection*{Bias vs. Variance}
\setlength{\mathindent}{0cm}
$
\E_D\E_{X,Y}\left(\hat{f}(X)-Y\right)^2 = \\
\E_D\E_X\left(\hat{f}(X) - \E(Y|X)\right)^2 + \E_{X,Y}\left(Y - \E(Y|X)\right)^2\\
= \E_X \E_D\left(\hat{f}(X) - \E_D(\hat{f}(X))\right)^2 \text{ (variance)}\\
+ \E_X\left(\E_D(\hat{f}(X)) - \E(Y|X)\right)^2 \text{ (bias}^2)\\
+ \E_{X,Y}\left(Y - \E(Y|X)\right)^2 \text{ (noise)}
$\\
%High bias can cause an algorithm to miss the relevant relations between features and target outputs (underfitting).\\
%High variance can cause overfitting: modeling the random noise in the training data, rather than the intended outputs.
% \subsection*{Gradient Descent}
% 1. Start arbitrary $w_0 \in \mathbb{R}$\\
% 2. For $i$ do $w_{t+1} = w_t - \eta_t \nabla \hat{R}(w_t)$
%\subsection*{Curse of Dimensionality}
%To obtain a reliable estimate at a given regularity, the required number of samples grows exponentially with the dimension of the sample space.
% \subsection*{Expected Error}
% For generalization, minimize the expected error
% $R(w) = \int P(x,y) (y-w^Tx)^2 \, dx \, dy$\\
% $= \mathbb{E}_{x,y}[(y-w^Tx)^2]$
\subsection*{Ridge Parametric to nonparametric}
Ansatz: $w=\sum_i \alpha_i x_i$\\
$w^* = \underset{w}{\operatorname{argmin}} \sum_i (w^Tx_i-y_i)^2 + \lambda ||w||_2^2$ = \\
${\operatorname{argmin}}_{\alpha_{1:n}} \sum_{i=1}^n (\sum_{j=1}^n \alpha_j x_j^T x_i - y_i)^2 + \lambda \sum_i \sum_j \alpha_i \alpha_j (x_i^T x_j)$\\
$= {\operatorname{argmin}}_{\alpha_{1:n}} \sum_{i=1}^n (\alpha^T K_i - y_i)^2 + \lambda \alpha^T K \alpha$\\
$= {\operatorname{argmin}}_{\alpha} ||K\alpha -y||_2^2 + \lambda \alpha^T K \alpha$\\
Closed form: $\alpha^* = (K+\lambda I)^{-1} y$\\
Prediction: $y^*= w^{*T} x = \sum_{i=1}^n \alpha_i^* k(x_i,x)$
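A kernel ridge sketch in numpy following the closed form above; the RBF kernel and the toy data are assumptions for illustration:

```python
import numpy as np

rng = np.random.default_rng(6)
n, lam = 40, 0.5
X = rng.normal(size=(n, 2))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=n)

def kernel(A, B):
    # RBF kernel k(x, x') = exp(-||x - x'||^2 / 2), an illustrative choice
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / 2)

K = kernel(X, X)
alpha = np.linalg.solve(K + lam * np.eye(n), y)   # alpha* = (K + lambda I)^{-1} y

x_star = np.array([[0.3, -1.0]])
y_star = kernel(x_star, X) @ alpha                # y* = sum_i alpha_i* k(x_i, x*)
print(y_star)
```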
@@ -0,0 +1,13 @@
% -*- root: Main.tex -*-
\section{Bayesian Methods}
\subsection*{MLE}
$\theta^* = \operatorname{argmax}_\theta P(y|x,\theta) $\\
$= {\operatorname{argmax}}_\theta \prod_{i=1}^n P(y_i|x_i, \theta) \text{\quad (iid)}$\\
$= {\operatorname{argmax}}_\theta \sum_{i=1}^n \log P(y_i|x_i,\theta)$
\subsection*{MAP}
$w^* = \underset{w}{\operatorname{argmax}} P(w|x,y) = \underset{w}{\operatorname{argmax}} \frac{P(w|x) P(y|x,w)}{P(y|x)}$\\
$=\underset{w}{\operatorname{argmax}} \log P(w) + \sum_i \log P(y_i|x_i,w) + const.$
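As a worked special case (assuming a Gaussian prior $w \sim \mathcal{N}(0,\tau^2 I)$ and Gaussian noise with variance $\sigma^2$, neither of which the sheet fixes), the MAP objective reduces to ridge regression:\\
$w^* = \underset{w}{\operatorname{argmin}} \sum_i (y_i - w^Tx_i)^2 + \tfrac{\sigma^2}{\tau^2}||w||_2^2$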
\subsection*{MLE = MAP}
MLE and MAP coincide when $n \rightarrow \infty$ or the prior is uniform.