\documentclass{article}
\usepackage{natbib}
\usepackage{algpseudocode}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{fullpage}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\Lik}{Lik}
\DeclareMathOperator*{\Diag}{Diag}
\begin{document}
Each section is a new lecture.
\section{Applications}
\begin{itemize}
\item slides on applications of machine learning.
\item calculus/programming quiz.
\end{itemize}
\section{Training data in supervised learning}
Training data: $D= \{(x_1,y_1), \dots, (x_n,y_n)\}$, given/known.
Inputs $x_i\in\mathcal X$, outputs
$y_i\in\mathcal Y$.
For each example, what is $\mathcal X,\mathcal Y$?
\begin{itemize}
\item Image classification
\item Image segmentation
\item Translation
\item Spam filtering
\end{itemize}
Usually $\mathcal X = \mathbb R^p$ (e.g. word counts in emails).
Notation.
\begin{itemize}
\item Inputs $x\in\mathbb R^p$.
\item $p$ = dimension of each input.
\item $n$ = number of training examples.
\item Outputs $y\in\mathbb R$ for regression, $y\in\{0,1\}$ for binary
classification.
\item $f:\mathcal X\rightarrow \mathcal Y$ is the prediction function
we want to learn.
\item Regression function $f:\mathbb R^p\rightarrow\mathbb R$
\item Binary classification function $f:\mathbb R^p\rightarrow \{0,1\}$.
\end{itemize}
Geometric interpretation of regression: son's height versus father's
height (linear pattern), species versus temperature (non-linear pattern).
Draw residuals $r_i$ on the graphs as vertical line segments.
\paragraph{Test data.} $D'= \{ (x'_1,y'_1), \dots, (x'_m,y'_m)\}$, unknown.
Goal is generalization to new/test data: algorithm $\textsc{Learn}(D) = f$,
with $f(x'_i)\approx y'_i$ for all test data, i.e. minimize
\begin{equation}
\text{Err}_{D'}(f) = \sum_{(x',y')\in D'} \ell[f(x'), y']
\end{equation}
where $\ell$ is a loss function:
\begin{itemize}
\item In binary classification we typically use the
mis-classification-rate/0-1-loss $\ell[f(x),y]=I[c_f(x)\neq y]$, where
the binary classifier $c_f(x)=0$ if $f(x)<0.5$ and 1 otherwise.
\item In regression we use the squared error $\ell[f(x),y]=[f(x)-y]^2$.
\end{itemize}
Since $D'$ is unknown, we need to assume that $D,D'\sim \mathcal P$.
\paragraph{Nearest neighbors algorithm.} What is near?
$\textsc{Dist}:\mathcal X\times \mathcal X \rightarrow \mathbb R_+$ is
a non-negative distance function between two data points.
e.g. for $\mathcal X=\mathbb R^p$ we use
\begin{itemize}
\item L1 distance $\textsc{Dist}(x,x') = ||x-x'||_1 = \sum_{j=1}^p |x_j-x_j'|$
\item L2 distance $\textsc{Dist}(x,x') = ||x-x'||_2 = \sqrt{
\sum_{j=1}^p (x_j-x_j')^2}$
\end{itemize}
Draw geometric interpretation of L1 and L2 distances.
Compute distances $d_1,\dots,d_n\in\mathbb R_+$ where
$d_i=\textsc{Dist}(x_i,x)$.
Example graph, draw horizontal line segments $d_i$ between $x,x_i$.
Define neighbors function $N_{D,K}(x') = \{t_1, t_2, \dots, t_K\}$, where\\
$t_1,\dots,t_n\in\{1,\dots,n\}$ are indices such that\\
distances $d_{t_1}\leq \cdots \leq d_{t_n}$ are sorted ascending.
Predict the mean output value of the $K$ nearest
neighbors, $$f_{D,K}(x') = \frac 1 K \sum_{i\in N_{D,K}(x')} y_i$$
\begin{itemize}
\item For regression this is the mean.
\item For binary classification the mean is interpreted as a
probability: predict 1 if greater than 0.5, and predict 0 otherwise.
\end{itemize}
\section{Nearest neighbors model selection and code}
\begin{itemize}
\item Review of supervised learning.
\item Definition of nearest neighbors prediction function $f_{D,K}(x)$.
\item How to choose $K$? Discussion.
\item Formal definition of total error.
\item Visualizations. 1d regression and binary classification.
\item Draw train/validation matrices. Validation error.
\item Pseudocode for computing $f_{D,K}(x)$.
\end{itemize}
\begin{equation}
\mathcal L_{D,D'}(k) = \sum_{(x',y')\in D'} \ell[f_{D,k}(x'), y']
\end{equation}
\begin{algorithmic}[1]
\State Function $\textsc{PredKNearestNeighbors}$
\State Inputs: train inputs/features $x_1,\dots,x_n$, outputs/labels $y_1,\dots,y_n$,\\ \ \ \ test input/feature $x'$, max number of neighbors $K_{\text{max}}$:
\For{$i=1$ to $n$}:
\State $d_i\gets \textsc{Distance}(x', x_i)$
\EndFor
\State $t_1,\dots,t_n\gets \textsc{SortedIndices}(d_1,\dots,d_n)$
\State $\text{totalY}\gets 0.0$
\For{$k=1$ to $K_{\text{max}}$}:
\State $i\gets t_k$
\State $\text{totalY} \texttt{ += } y_i$
\EndFor
\State Output: prediction $\text{totalY}/K_{\text{max}}$
\end{algorithmic}
Total complexity: $O(np + n\log n)$ where the Distance sub-routine is $O(p)$.
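The pseudocode above maps directly onto a few lines of R. Below is a
minimal sketch (in R rather than the C++ demo used later), assuming the
train inputs are the rows of a numeric matrix and using the L2 distance;
the function name \verb|PredKNearestNeighbors| is only illustrative.
\begin{verbatim}
## Minimal R sketch of the pseudocode above (illustrative, not the course demo).
## X: n x p numeric matrix of train inputs, y: length-n train outputs/labels,
## x.new: length-p test input, K.max: number of neighbors to average.
PredKNearestNeighbors <- function(X, y, x.new, K.max){
  diff.mat <- X - matrix(x.new, nrow(X), ncol(X), byrow=TRUE)
  d.vec <- sqrt(rowSums(diff.mat^2))   # L2 distances, O(np)
  t.vec <- order(d.vec)                # sorted indices, O(n log n)
  mean(y[t.vec[1:K.max]])              # mean label of the K.max nearest neighbors
}
## usage on simulated 1d regression data
set.seed(1)
X <- matrix(runif(100), 100, 1)
y <- sin(10*X[,1]) + rnorm(100, sd=0.1)
PredKNearestNeighbors(X, y, x.new=0.5, K.max=5)
\end{verbatim}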
\section{Cross-validation}
Interactive data viz.
Remember we assume that training data and test data are randomly drawn
from a common probability distribution $\mathcal P$. To simulate that
process, we randomly assign a fold ID for each observation
$z_1,\dots,z_n\in\{1,\dots, S_\text{max}\}$. We then define
$S_\text{max}$ train/validation splits
\begin{eqnarray*}
D_{z=s} &=& \{(x_i,y_i)|z_i = s\} \text{ --- validation set $s$}\\
D_{z\neq s} &=& \{(x_i,y_i)|z_i \neq s\} \text{ --- train set $s$}
\end{eqnarray*}
Draw train/validation/test matrices for $s=1$, $s=2$, etc.
Cross-validation data visualizations.
Mean validation error over all train/validation splits (folds):
\begin{equation}
\text{MeanVerr}(k) = \frac{1}{S_{\text{max}}}\sum_{s=1}^{S_{\text{max}}} \mathcal L_{D_{z\neq s},D_{z=s}}(k)
\end{equation}
Overall we choose the parameter $\hat k = \argmin_k \text{MeanVerr}(k)$.
And the learning algorithm is $\textsc{LearnKNN}(D) = f_{D, \hat k}$.
\subsection{Pseudocode}
We assume there are sub-routines
\begin{itemize}
\item
$\textsc{Pred1toKmaxNN}(D, x', K_{\text{max}}) = [ f_{D, 1}(x')
... f_{D, K_{\text{max}}}(x') ]$ -- all predictions from $k=1$ to
$K_{\text{max}}$ neighbors for a single test point $x'$ based on the training
data set $D$.
\item $\textsc{PredError}(D, D', K_{\text{max}})$ trains on $D$ and
computes $\mathcal L_{D,D'}(k)$, the validation error wrt $D'$ for
$k=1$ to $K_{\text{max}}$ neighbors.
\end{itemize}
Then the pseudocode for the learning algorithm is:
\begin{algorithmic}[1]
\State Function $\textsc{LearnKNN}$
\State Inputs: train data $D$ ($n$ observations in $p$ dimensions),
number of folds $S_{\text{max}}\in\{1,\dots, n\}$,
max number of neighbors $K_{\text{max}}\in\{1, \dots, n\}$.
\State Randomly assign fold IDs $z\in\{1,\dots,S_{\text{max}}\}^n$
\For{each fold $s=1$ to $S_{\text{max}}$}:
\State $E_s\gets \textsc{PredError}(D_{z\neq s}, D_{z=s}, K_{\text{max}})\in\mathbb R^{K_{\text{max}}}$
\EndFor
\State Let
$E\gets[E_1 ... E_{S_{\text{max}}}]\in\mathbb R^{K_{\text{max}}\times
S_{\text{max}}}$ be the matrix of error/loss values for all
folds/columns and neighbors/rows.
\State $\text{MeanVerr}\gets \textsc{rowMeans}(E)\in\mathbb R^{K_{\text{max}}}$
\State The optimal number of neighbors is $\hat k = \argmin_k \text{MeanVerr}_k$
\State return $f_{D,\hat k}$
\end{algorithmic}
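A minimal R sketch of this learning algorithm is below. The helper
\verb|PredError| naively re-sorts the distances for every validation
point, and all names are illustrative; the fold assignment, the error
matrix $E$, and the \verb|rowMeans| step follow the pseudocode above,
and the returned value is the prediction function $f_{D,\hat k}$.
\begin{verbatim}
## R sketch of LearnKNN (illustrative). PredError returns the validation
## error for k = 1, ..., K.max neighbors, using the squared error loss.
PredError <- function(X.train, y.train, X.valid, y.valid, K.max){
  err.mat <- matrix(NA, K.max, nrow(X.valid))
  for(i in 1:nrow(X.valid)){
    diff.mat <- X.train - matrix(X.valid[i,], nrow(X.train), ncol(X.train), byrow=TRUE)
    t.vec <- order(sqrt(rowSums(diff.mat^2)))
    pred.k <- cumsum(y.train[t.vec[1:K.max]]) / (1:K.max)  # f_{D,k}(x') for k=1..K.max
    err.mat[, i] <- (pred.k - y.valid[i])^2
  }
  rowSums(err.mat)
}
LearnKNN <- function(X, y, S.max=5, K.max=20){
  z <- sample(rep(1:S.max, length.out=nrow(X)))   # random fold IDs
  E <- sapply(1:S.max, function(s){               # K.max x S.max error matrix
    PredError(X[z!=s,,drop=FALSE], y[z!=s], X[z==s,,drop=FALSE], y[z==s], K.max)
  })
  k.hat <- which.min(rowMeans(E))                 # minimize MeanVerr over k
  function(x.new){                                # return f_{D, k.hat}
    diff.mat <- X - matrix(x.new, nrow(X), ncol(X), byrow=TRUE)
    mean(y[order(sqrt(rowSums(diff.mat^2)))[1:k.hat]])
  }
}
\end{verbatim}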
\section{R package development}
Choose groups.
\section{Coding nearest neighbors prediction}
For one test point, demo code in C++.
\section{Linear regression / gradient descent}
Want to minimize squared error on test data, but we don't have them,
so instead we do
\begin{equation}
\min_f \sum_{i=1}^n [f(x_i)-y_i]^2
\end{equation}
We parameterize a linear function using a weight vector $w\in\mathbb R^p$,
$f_w(x_i)=w^T x_i$.
The input feature matrix is
\begin{equation}
X = \left[\begin{array}{c}
x_1^T \\
\vdots \\
x_n^T
\end{array}\right]\in\mathbb R^{n\times p}
\end{equation}
So for a particular weight vector $w$ the prediction vector is
\begin{equation}
Xw = \left[\begin{array}{c}
x_1^T w \\
\vdots \\
x_n^T w
\end{array}\right]\in\mathbb R^{n}
\end{equation}
So the least squares regression problem is
\begin{equation}
\min_{w\in\mathbb R^p} || Xw - y||_2^2
\end{equation}
Notation about matrix/vector dimensions.
A norm $||\cdot||$ measures the size of a vector.
Squared L2 norm $||v||_2^2 = v^T v$.
L1 norm $||v||_1 = \sum_{j=1}^p |v_j|$.
Univariate linear regression: $p=2$, $x_i=[1\ x_{i,2}]$.
\begin{equation}
X = \left[\begin{array}{cc}
1 & x_{1,2} \\
\vdots & \vdots \\
1 & x_{n,2}
\end{array}\right]\in\mathbb R^{n\times 2}
\end{equation}
and
$f(x_i) = w^T x_i = \underbrace{w_1}_{\text{intercept}} +
\underbrace{w_2}_{\text{slope}} x_{i,2}$. Draw figure on board then
show data viz.
Gradient is defined for any $w$ as the direction of steepest ascent:
\begin{equation}
\nabla \mathcal L(w) = \left[\begin{array}{c}
\frac{d}{dw_1} \mathcal L(w)\\
\vdots \\
\frac{d}{dw_p} \mathcal L(w)\\
\end{array}\right]\in\mathbb R^{p}
\end{equation}
For $p=1$ we have
\begin{equation}
\nabla \mathcal L(w) = \mathcal L'(w) = \frac{d}{dw}\mathcal L(w) =
\sum_{i=1}^n 2x_i(w x_i - y_i).
\end{equation}
In general we have
\begin{equation}
\nabla \mathcal L(w) =
\nabla_w ||Xw - y||_2^2 =
2X^T(Xw-y).
\end{equation}
Algorithm: let $w^{(0)}=0$ be the initial guess.
Then for any stepsize $\alpha>0$ define update rules for any iteration $t>0$:
\begin{equation}
w^{(t)} = w^{(t-1)} - \alpha \nabla \mathcal L(w^{(t-1)}).
\end{equation}
Example: gradient descent on $\mathcal L(w)=(w+5)^2$. Start at the
origin. Compute gradient, first step.
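A minimal R sketch of this algorithm for the least squares problem,
using the gradient $2X^T(Xw-y)$; the step size and number of iterations
are arbitrary choices for illustration.
\begin{verbatim}
## Gradient descent for min over w of ||Xw - y||^2 (illustrative sketch).
LMSquareLossIterations <- function(X, y, step.size=0.01, max.iterations=500){
  w <- rep(0, ncol(X))                       # w^(0) = 0
  for(t in 1:max.iterations){
    gradient <- 2 * t(X) %*% (X %*% w - y)   # 2 X^T (Xw - y)
    w <- w - step.size * as.numeric(gradient)
  }
  w
}
## usage: univariate regression with an intercept column of ones
set.seed(1)
x <- runif(50)
X <- cbind(1, x)
y <- 2 + 3*x + rnorm(50, sd=0.1)
LMSquareLossIterations(X, y)   # should be close to c(2, 3)
\end{verbatim}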
\section{Probabilistic interpretation of linear regression}
Definitions of argmin, min.
\begin{equation*}
\argmin_{w\in\mathbb R^p}\mathcal L(w) = \{w^*\in\mathbb R^p|\mathcal L(w^*)\leq \mathcal L(w_0)\ \forall w_0\in\mathbb R^p\}.
\end{equation*}
Draw quadratic function $x^2$ and $x^2+1$ which have the same argmin
but different min.
Definition of argmax: draw upside down quadratic function $-x^2$ to
show relationship between argmin and argmax.
Why do we need to do this derivation?
\begin{enumerate}
\item For binary classification can't minimize zero-one loss via gradient
descent.
\item Using $w^T x_i\in\mathbb R$ directly for binary classification
does not make sense.
\end{enumerate}
Begin by re-deriving the least squares problem, assuming labels are
normally distributed.
Let $y_i\sim N(w^T x_i, s^2)$, draw probability density function.
\begin{equation*}
\Pr(y_i, w^T x_i, s^2) = (2\pi s^2)^{-1/2} \exp\left[
\frac{-1}{2s^2}(y_i-w^Tx_i)^2
\right]
\end{equation*}
Let the likelihood of $w$ be
\begin{equation*}
\Lik(w) = \prod_{i=1}^n \Pr(y_i, w^T x_i, s^2)
\end{equation*}
Then the log-likelihood is
\begin{equation*}
\log\Lik(w) = \sum_{i=1}^n \log\Pr(y_i, w^T x_i, s^2)
\end{equation*}
Then we have
\begin{eqnarray*}
\argmax_w \log\Lik(w)
&=& \argmax_w \sum_{i=1}^n \underbrace{
\frac{-1}{2}\log(2\pi s^2)
}_{\text{constant wrt $w$}} -
\frac{1}{2s^2}(y_i-w^Tx_i)^2\\
&=& \argmax_w -\sum_{i=1}^n
\underbrace{
\frac{1}{2s^2}
}_{\text{cst}}(y_i-w^Tx_i)^2\\
&=& \argmin_w \sum_{i=1}^n
(y_i-w^Tx_i)^2\\
\end{eqnarray*}
For binary labels assume a Bernoulli distribution. Exercise: derive the
logistic loss, assuming $\tilde y_i\in\{-1,1\}$.
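As a quick numerical check of the equivalence derived above, the R
sketch below compares the minimizer of the squared error with the
maximizer of the Gaussian log-likelihood (taking $s=1$) on simulated
data; both calls should return essentially the same weight vector.
\begin{verbatim}
## Sketch: minimizing squared error = maximizing the Gaussian log-likelihood.
set.seed(1)
x <- runif(100)
X <- cbind(1, x)
y <- 1 + 2*x + rnorm(100, sd=0.5)
square.loss <- function(w) sum((X %*% w - y)^2)
neg.log.lik <- function(w) -sum(dnorm(y, mean=X %*% w, sd=1, log=TRUE))
optim(c(0, 0), square.loss)$par
optim(c(0, 0), neg.log.lik)$par   # essentially the same minimizer
\end{verbatim}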
\section{Logistic regression}
Assume $y_i\sim \text{Bernoulli}(p_i)$ with
$p_i = \sigma(w^T x_i)\in(0,1)$ and
\begin{equation*}
\sigma(z) = \frac{1}{1+e^{-z}}.
\end{equation*}
The probability mass function is
\begin{equation*}
\Pr(y_i, p_i) =
\begin{cases}
p_i & \text{ if } y_i=1\\
1-p_i & \text{ if } y_i=0.
\end{cases}
\end{equation*}
The log-likelihood is then
\begin{eqnarray*}
\log\Lik(w) &=& \sum_{i=1}^n\log \Pr(y_i, \sigma(w^T x_i))\\
&=& \sum_{i=1}^n \log[\sigma(w^T x_i)] I(y_i=1) + \log[1-\sigma(w^T x_i)] I(y_i=0)\\
&=& \sum_{i=1}^n \log[\frac{1}{1+\exp(-w^T x_i)}] I(y_i=1) +
\log[\underbrace{1-\frac{1}{1+\exp(-w^T x_i)}}_{
\frac{\exp(-w^T x_i)}{1+\exp(-w^T x_i)}
}] I(y_i=0)\\
&=& \sum_{i=1}^n \log[\frac{1}{1+\exp(-w^T x_i)}] I(\tilde y_i=1) +
\log[\frac{1}{1+\exp(w^T x_i)}] I(\tilde y_i=-1)\\
&=& \sum_{i=1}^n \log[\frac{1}{1+\exp(-\tilde y_i w^T x_i)}]\\
&=& \sum_{i=1}^n -\log[1+\exp(-\tilde y_i w^T x_i)]
\end{eqnarray*}
where the alternate labels are
\begin{equation*}
\tilde y_i =
\begin{cases}
-1 & \text{ if } y_i=0\\
1 & \text{ if } y_i=1.
\end{cases}
\end{equation*}
The logistic loss function that we want to minimize is thus
\begin{equation*}
\mathcal L(w) = -\log\Lik(w) = \sum_{i=1}^n
\underbrace{\log[1+\exp(-\tilde y_i w^T x_i)]}_{
\ell[w^T x_i, \tilde y_i]
}.
\end{equation*}
The gradient is
\begin{equation*}
\nabla_w \ell[w^T x_i, \tilde y_i] = \frac{
-\tilde y_i x_i\exp(-\tilde y_i w^T x_i)
}{
1+\exp(-\tilde y_i w^T x_i)
}=\frac{
-\tilde y_i x_i
}{
1+\exp(\tilde y_i w^T x_i)
}
\end{equation*}
Draw the logistic loss for $y_i=1$ as a function of $w^T x_i$.
\begin{eqnarray*}
\ell[w^T x_i, \tilde y_i] &=& \log[1+\exp(- w^T x_i)] \\
\nabla_{w^T x_i} \ell[w^T x_i, \tilde y_i] &=& \frac{
- 1
}{
1+\exp( w^T x_i)
}
\end{eqnarray*}
\begin{itemize}
\item As $w^T x_i\rightarrow \infty$ we have
$\exp(-w^T x_i)\rightarrow 0$ and
$\log [1+\exp(- w^T x_i)]\rightarrow 0$.
\item As $w^T x_i\rightarrow -\infty$ we have
$\exp(-w^T x_i)\rightarrow \infty$,
$\ell[w^T x_i, \tilde y_i] = \log [1+\exp(- w^T x_i)]\rightarrow \infty$,
$\exp(w^T x_i)\rightarrow 0$, and $\nabla_{w^T x_i} \ell[w^T x_i, \tilde y_i] \rightarrow -1$.
\end{itemize}
The gradient $\nabla\mathcal L:\mathbb R^p\rightarrow\mathbb R^p$ is
\begin{equation*}
\nabla \mathcal L(w) = \sum_{i=1}^n \nabla_w \ell[w^T x_i, \tilde y_i] =
\sum_{i=1}^n \frac{
-\tilde y_i x_i
}{
1+\exp(\tilde y_i w^T x_i)
} = -X^T \tilde Y S(-\tilde Y X w),
\end{equation*}
where $\tilde Y=\Diag(\tilde y)$ and $S:\mathbb R^n\rightarrow \mathbb R^n$ is the componentwise application
of the logistic link function,
$S(v) = \left[
\begin{array}{ccc}
\sigma(v_1) & \cdots & \sigma(v_n)
\end{array}
\right]^T$.
Discuss gradient descent algorithm, same as least squares regression.
Discuss learning algorithm which chooses the number of steps (early
stopping) that minimizes the mean validation loss from CV.
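A minimal R sketch of gradient descent on the logistic loss, using the
matrix form of the gradient above with labels $\tilde y_i\in\{-1,1\}$,
and a single train/validation split (rather than full cross-validation)
to pick the number of steps; the step size and iteration count are
arbitrary choices for illustration.
\begin{verbatim}
## Gradient descent on the logistic loss (illustrative sketch).
## X: n x p matrix, y.tilde: labels in {-1, 1}.
LMLogisticLossIterations <- function(X, y.tilde, step.size=0.02, max.iterations=500){
  sigmoid <- function(z) 1/(1+exp(-z))
  w <- rep(0, ncol(X))
  W <- matrix(NA, ncol(X), max.iterations)   # column t stores w^(t), for early stopping
  for(t in 1:max.iterations){
    grad <- -t(X) %*% (y.tilde * sigmoid(-y.tilde * (X %*% w)))  # -X^T Ytilde S(-Ytilde X w)
    w <- w - step.size * as.numeric(grad)
    W[, t] <- w
  }
  W
}
## usage: pick the number of steps that minimizes the validation logistic loss
set.seed(1)
x <- runif(200)
X <- cbind(1, x)
y.tilde <- ifelse(runif(200) < 1/(1+exp(-(4*x-2))), 1, -1)
is.valid <- sample(c(TRUE, FALSE), 200, replace=TRUE)
W <- LMLogisticLossIterations(X[!is.valid,], y.tilde[!is.valid])
valid.loss <- colSums(log(1+exp(-y.tilde[is.valid] * (X[is.valid,] %*% W))))
best.iterations <- which.min(valid.loss)
\end{verbatim}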
\section{L2 regularization}
Define the cost function
\begin{equation}
\label{eq:cost}
C_\lambda(w) = \mathcal L(w) + \lambda||w||_2^2.
\end{equation}
\begin{itemize}
\item The loss $\mathcal L(w)$ encourages fitting the train data.
\item The squared L2 norm $||w||_2^2$ encourages a regularized model.
\item $\lambda\geq 0$ is a penalty constant (larger for more regularization).
\end{itemize}
For a given penalty $\lambda$ the L2 regularized linear model is
$f_{w^\lambda}(x)= (w^\lambda)^T x$, where $w^\lambda$ is the
weight vector with minimal cost:
\begin{equation}
w^\lambda = \argmin_{w\in\mathbb R^p} C_\lambda(w).
\end{equation}
The learning algorithm then looks like:
\begin{itemize}
\item For each train/validation split do:
\item For $\lambda\in\{\lambda_1, \dots, \lambda_{m}\}$ compute the
optimal weight vector $w^\lambda$, where $m$ is the number of
penalty values/models considered. Combine them into a matrix
$W\in\mathbb R^{p\times m}$.
\item Compute prediction matrix $\hat Y = XW\in\mathbb R^{n\times m}$.
\item Compute average loss on this validation set,
$\mathcal L(\hat Y, Y)$.
\item Let $\hat \lambda = \argmin_\lambda \text{MeanValidationLoss}(\lambda)$.
\item Use $f_{w^{\hat \lambda}}$ for the final predictions on test data.
\end{itemize}
To compute $w^\lambda$ we need the gradient:
\begin{equation}
\label{eq:L2-reg-gradient}
\nabla C(w) = \nabla \mathcal L(w) + 2\lambda w.
\end{equation}
The gradient of the loss $\nabla \mathcal L$ is the same as we computed
earlier for the early stopping model.
In the statistics literature we usually parameterize the linear
function $f_{\beta,w}(x)=\beta + x^T w$ with an additional
intercept/bias term $\beta\in\mathbb R$, which is un-penalized:
\begin{equation}
\label{eq:cost-bias}
C_\lambda(\beta, w) = \mathcal L(\beta,w) + \lambda||w||_2^2.
\end{equation}
Then we need to compute the gradient with respect to both variables,
$\nabla_\beta C(\beta, w), \nabla_w C(\beta, w)$.
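A minimal R sketch of computing $w^\lambda$ by gradient descent on the
cost (\ref{eq:cost}) with the square loss (no intercept, for
simplicity); the step size is set from the largest eigenvalue of the
Hessian $2X^TX+2\lambda I$ so the iterations converge, and the penalty
grid is arbitrary.
\begin{verbatim}
## Gradient descent on the L2-regularized square loss cost (illustrative sketch).
LMSquareLossL2 <- function(X, y, lambda, max.iterations=1000){
  L <- 2*max(eigen(t(X) %*% X, symmetric=TRUE, only.values=TRUE)$values) + 2*lambda
  step.size <- 1/L                           # 1 / (Lipschitz constant of the gradient)
  w <- rep(0, ncol(X))
  for(t in 1:max.iterations){
    cost.gradient <- 2*t(X) %*% (X %*% w - y) + 2*lambda*w   # grad L(w) + 2 lambda w
    w <- w - step.size * as.numeric(cost.gradient)
  }
  w
}
## W has one column w^lambda per penalty value, as in the algorithm above
set.seed(1)
x <- runif(50)
X <- cbind(1, x)
y <- 2 + 3*x + rnorm(50, sd=0.1)
lambda.vec <- 10^seq(-3, 3, by=1)
W <- sapply(lambda.vec, function(lambda) LMSquareLossL2(X, y, lambda))
\end{verbatim}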
\section{Scaling}
Gradient descent will only work well if the inputs are scaled. Example: an input
matrix with height in mm, weight in kg, and shoe size.
We start with an unscaled input matrix $X\in\mathbb R^{n \times p}$
and we want to do gradient descent on a scaled input matrix
$\tilde X\in\mathbb R^{n\times \tilde p}$, where $\tilde p\leq p$ is
fewer features than we started out with, because some with zero
variance may be filtered before doing gradient descent.
To define the scaled features we need to compute the mean $m_j$ and
standard deviation $s_j$ of each column $j\in\{1,\dots, p\}$ of $X$:
\begin{eqnarray*}
m_j &=& \frac 1 n \sum_{i=1}^n x_{ij}\\
s_j &=& \sqrt{\frac 1 n \sum_{i=1}^n (x_{ij} - m_j)^2}
\end{eqnarray*}
We define the set of features with non-zero variance in the train set as
$\mathcal J = \{j|s_j>0\}$. The size of this set is
$\tilde p = |\mathcal J|$, the number of features/columns in the
scaled data matrix. Let $x_{\mathcal J}\in\mathbb R^{\tilde p}$ be the
unscaled feature vector, omitting the entries/features which have zero
variance in the train set.
Then let $s =\left[
\begin{array}{ccc}
s_1&\cdots & s_p
\end{array}
\right]^T\in\mathbb R^p$ be the vector of standard deviations for all
features, and let $\tilde s =\left[
\begin{array}{ccc}
\tilde s_1&\cdots & \tilde s_{\tilde p}
\end{array}
\right]^T = s_{\mathcal J}\in\mathbb R^{\tilde p}$ be the standard
deviations for all features with non-zero variance in the training
set. The diagonal scaling matrix is
\begin{equation}
\tilde S^{-1} = \text{Diag}(\tilde s)^{-1}=\left[
\begin{array}{ccc}
1/\tilde s_1& 0&0\\
0 & \ddots & 0\\
0 & 0 & 1/\tilde s_{\tilde p}
\end{array}
\right]\in\mathbb R^{\tilde p\times \tilde p}
\end{equation}
Then for any unscaled feature vector $x\in \mathbb R^p$ we define the
corresponding scaled feature vector as
\begin{equation}
\label{eq:scaled-feature}
\tilde x = \tilde S^{-1} (x_{\mathcal J} - m_{\mathcal J}).
\end{equation}
So the linear model on a scaled feature vector
$\tilde x\in\mathbb R^{\tilde p}$ is
\begin{equation}
\label{eq:scaled-prediction}
\tilde f_{\tilde w}(\tilde x)=\tilde w^T \tilde x,
\end{equation}
where
$\tilde w\in\mathbb R^{\tilde p}$ is the scaled weight vector.
The cost we want to minimize in gradient descent is then
\begin{equation}
\label{eq:scaled-cost}
\tilde C_\lambda( \tilde w) = \tilde{ \mathcal L}(\tilde w) + \lambda ||\tilde w||_2^2.
\end{equation}
Gradient descent gives us an optimal scaled weight vector
$\tilde w^\lambda = \argmin_{\tilde w\in\mathbb R^{\tilde p}} \tilde
C_\lambda(\tilde w)$. To make a prediction using a new unscaled test
data point $x\in\mathbb R^p$ we
\begin{itemize}
\item scale it via (\ref{eq:scaled-feature}), which gives us
$\tilde x\in\mathbb R^{\tilde p}$.
\item compute the inner product with the learned weight vector (\ref{eq:scaled-prediction}).
\end{itemize}
It is equivalent (and more convenient for the user) to return the
weight vector $w$ in the original (unscaled) space:
\begin{equation}
\tilde f_{\tilde w}(\tilde x) = \tilde x^T \tilde w =
(x_{\mathcal J}-m_{\mathcal J})^T \tilde S^{-1}\tilde w =
x_{\mathcal J}^T \underbrace{\tilde S^{-1} \tilde w}_w
\underbrace{- m_{\mathcal J}^T \tilde S^{-1} \tilde w}_{\beta},
\end{equation}
which implies that the prediction function $f_{\beta,w}(x)$ for an
unscaled input/feature vector $x\in\mathbb R^p$ is defined by the
bias/intercept $\beta\in\mathbb R$ and weight vector $w$ as
above. Actually there is a slight abuse of notation: the equation
above only defines the weights for the features $j\in\mathcal J$ with
non-zero variance in the train data. The other features
$j\not\in\mathcal J$ with zero variance in the train data should have
zero weight in the final weight vector $w$.
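A minimal R sketch of this scaling: compute $m_j,s_j$ on the train
inputs, drop zero-variance columns, scale, and after learning
$\tilde w$ (for example with any of the gradient descent sketches
above) convert back to an intercept $\beta$ and a weight vector $w$ on
the original scale; all names are illustrative.
\begin{verbatim}
## Scaling sketch: learn on scaled features, report weights on the original scale.
scale.train <- function(X){
  m <- colMeans(X)
  s <- sqrt(colMeans((X - matrix(m, nrow(X), ncol(X), byrow=TRUE))^2))  # sd with 1/n
  J <- which(s > 0)                                # keep non-zero-variance features
  X.scaled <- sweep(sweep(X[, J, drop=FALSE], 2, m[J]), 2, s[J], "/")   # tilde X
  list(X.scaled=X.scaled, m=m, s=s, J=J)
}
unscale.weights <- function(w.scaled, m, s, J, p){
  w <- rep(0, p)                                   # zero weight for filtered features
  w[J] <- w.scaled / s[J]                          # w = S^{-1} tilde w
  beta <- -sum(m[J] * w[J])                        # beta = -m_J^T S^{-1} tilde w
  list(intercept=beta, weight=w)
}
\end{verbatim}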
\section{Exact line search}
Gradient descent is for finding a weight vector $w\in\mathbb R^p$
which minimizes a smooth cost function $C(w)$.
We start the algorithm at the origin, $w^{(0)}=0$.
For each iteration $t$ we compute the descent direction
\begin{equation}
\label{eq:descent-direction}
d^{(t)} = -\nabla C(w^{(t)})
\end{equation}
Then we define the next iteration by taking a step in that direction,
\begin{equation}
\label{eq:gradient-step}
w^{(t+1)} = w^{(t)} + \alpha^{(t)} d^{(t)},
\end{equation}
where $\alpha^{(t)}>0$ is an iteration-specific step size parameter.
For line search the main idea is that we want to choose a step
size that minimizes the cost:
\begin{equation}
\label{eq:line-search-cost}
\mathcal C_t(\alpha) = C(w^{(t)} + \alpha d^{(t)}).
\end{equation}
For exact line search we take the best possible step,
\begin{equation}
\label{eq:exact-line-search}
\alpha^{(t)} = \argmin_\alpha \mathcal C_t(\alpha).
\end{equation}
Example: a function with a minimum at (-1, 2).
\begin{equation}
C(w) = 0.5(w_1 +1)^2 + 0.5 (w_2-2)^2 = 0.5||w + \left[
\begin{array}{cc}
1&-2
\end{array}
\right]^T ||_2^2.
\end{equation}
Derive the exact line search step.
The exact line search step can be derived for the L2-regularized
linear model for regression.
\begin{eqnarray}
\mathcal C_t(\alpha) &=& 0.5||X(w^{(t)} + \alpha d^{(t)}) -y||_2^2 +
0.5\lambda||w^{(t)} + \alpha d^{(t)}||_2^2\\
\mathcal C'_t(\alpha) &=& d^{(t) T} X^T [X(w^{(t)} + \alpha d^{(t)}) -y] +
\lambda d^{(t) T} [w^{(t)} + \alpha d^{(t)}].
\end{eqnarray}
Setting the derivative equal to zero and solving for $\alpha$ yields
\begin{equation}
\alpha^{(t)} = \argmin_\alpha \mathcal C_t(\alpha) =
\frac{
d^{(t) T} \overbrace{[ -X^T(Xw^{(t)}-y) - \lambda w^{(t)} ]}^{d^{(t)}}
}{
d^{(t) T} (X^T X + \lambda I_p) d^{(t)}
} = \frac{
||d^{(t)}||_2^2
}{
|| X d^{(t)} ||_2^2 +
\lambda ||d^{(t)}||_2^2
}
\end{equation}
Is it any slower than the constant gradient step? We are already computing
the gradient / descent direction,
\begin{equation}
d^{(t)} = -\nabla C(w^{(t)}) = -X^T(X w^{(t)} - y) - \lambda w^{(t)}.
\end{equation}
\begin{itemize}
\item Computing each element of $X w^{(t)}\in\mathbb R^n$ is an $O(p)$
operation, so overall time is $O(np)$.
\item The next operation, subtracting $y$ is $O(n)$.
\item For the next operation, left-multiply by $-X^T$, computing each
of the $p$ elements is $O(n)$, so overall time is $O(np)$.
\item Overall computation of $d^{(t)}$ is thus $O(np)$ (adding the $\lambda w^{(t)}$ term costs only $O(p)$).
\end{itemize}
Computing the largest component of $\alpha^{(t)}$, $||Xd^{(t)}||_2^2$,
is also $O(np)$ so there is no asymptotic difference in time
complexity between constant and exact line search (for this problem).
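A minimal R sketch of gradient descent with exact line search for this
cost (with the $0.5$ factors used in the derivation above); the number
of iterations is an arbitrary choice for illustration.
\begin{verbatim}
## Gradient descent with exact line search (illustrative sketch).
## Cost: 0.5*||Xw - y||^2 + 0.5*lambda*||w||^2, as in the derivation above.
LMSquareLossExactLineSearch <- function(X, y, lambda, max.iterations=100){
  w <- rep(0, ncol(X))
  for(t in 1:max.iterations){
    d <- as.numeric(-(t(X) %*% (X %*% w - y) + lambda*w))     # descent direction
    alpha <- sum(d^2) / (sum((X %*% d)^2) + lambda*sum(d^2))  # exact step size
    w <- w + alpha * d
  }
  w
}
set.seed(1)
x <- runif(50)
X <- cbind(1, x)
y <- 2 + 3*x + rnorm(50, sd=0.1)
LMSquareLossExactLineSearch(X, y, lambda=0.1)
\end{verbatim}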
\section{Backtracking line search}
Backtracking line search is more general than exact line search, in
the sense that it can be used on more cost functions. It performs
approximate/numerical solution of the optimal step size (rather than
the analytic solution used by exact line search).
The main idea is to start by trying a big $\alpha=1$. If the cost
increases then the step is too big, so decrease the step by a
reduction factor $\rho\in(0,1)$. Keep decreasing the step size until
the cost decreases (it is guaranteed to when $d$ is a descent
direction). In practice the range of $\rho$ values is from 0.1 (crude
search) to 0.8 (more exhaustive search).
In practice the simple rule above is not used, because it may result
in a sequence of steps with very small decreases in cost. Instead we
introduce a parameter $\gamma\in(0,0.5)$ which controls how much
progress/decrease is significant/acceptable.
We want $\alpha$ such that
\begin{equation}
\mathcal C_t(\alpha) = C(w^{(t)} + \alpha d^{(t)}) <
C(w^{(t)}) + \gamma \alpha \nabla C(w^{(t)})^T d^{(t)} = \mathcal A_{t}^\gamma(\alpha)
\end{equation}
When $\gamma=0$ any decrease in cost is acceptable; when
$\gamma=0.5$ only a very large decrease is acceptable. In
practice $\gamma$ is usually between 0.01 and 0.3 \citep[page 466]{Boyd2004}.
The approximate cost $\mathcal A_{t}^\gamma(\alpha)$ for $\gamma=1$ is
a linear approximation of the cost around $w^{(t)}$, which
underestimates the cost since it is a convex function. By using a
$\gamma<0.5$ we get a linear approximation between the min cost and
min decrease (draw plot).
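A minimal R sketch of one backtracking step with the sufficient-decrease
condition above, using typical values $\rho=0.5$ and $\gamma=0.01$; the
quadratic from the exact line search section is re-used as the example
cost.
\begin{verbatim}
## Backtracking line search (illustrative sketch).
## cost.fun(w) returns the scalar cost, gradient.fun(w) returns its gradient.
backtracking.step <- function(cost.fun, gradient.fun, w, rho=0.5, gamma=0.01){
  g <- gradient.fun(w)
  d <- -g                                 # descent direction
  alpha <- 1                              # start with a big step
  while(cost.fun(w + alpha*d) > cost.fun(w) + gamma*alpha*sum(g*d)){
    alpha <- rho * alpha                  # shrink until the decrease is acceptable
  }
  w + alpha*d
}
## usage on C(w) = 0.5*(w1+1)^2 + 0.5*(w2-2)^2, minimum at c(-1, 2)
cost.fun <- function(w) 0.5*(w[1]+1)^2 + 0.5*(w[2]-2)^2
gradient.fun <- function(w) c(w[1]+1, w[2]-2)
w <- c(0, 0)
for(t in 1:20) w <- backtracking.step(cost.fun, gradient.fun, w)
w   # approaches c(-1, 2)
\end{verbatim}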
\section{Neural networks}
We follow the presentation/notation of \citet[section~16.5]{Murphy2012}.
Draw neural network diagrams for logistic regression and, e.g., a ($p$, 5, 10, 1) network, which has two hidden layers and one output.
\section{Backpropagation algorithm for single layer regression network}
$$
x \xrightarrow{V} a \xrightarrow{\sigma} z \xrightarrow{w} b
$$
Overall prediction function is
$$
f(x_i) = b_i = w^T z_i = w^T S(a_i) = w^T S(V^T x_i)
$$
Feature matrix is $X\in\mathbb R^{n\times p}$. Feature vector for one observation is $x_i\in\mathbb R^p$. One feature of that observation is $x_{ij}\in\mathbb R$.
Labels are $y\in\mathbb R^n$ for regression. One label is $y_i\in\mathbb R$.
Hidden layer vector is $z_i = S(a_i) = S(V^T x_i) \in\mathbb R^u$. One hidden unit is $z_{ik}=\sigma(a_{ik})\in\mathbb R$.
Indices are $i\in\{1,\dots, n\}$ for observations/examples, $j\in\{1,\dots,p\}$ for features, and $k\in\{1,\dots, u\}$ for hidden units.
Weight matrix for first layer is $V\in\mathbb R^{p\times u}$. Weight vector for predicting hidden unit $k$ is $v_k\in\mathbb R^p$.
Weight vector for predicting output is $w\in\mathbb R^u$.
Overall loss function that we want to minimize is
$$
\mathcal L(w, V) = \frac 1 n \sum_{i=1}^n \frac 1 2 [ f(x_i) - y_i]^2 = \frac 1 n \sum_{i=1}^n \frac 1 2 [ w^T S(V^T x_i) - y_i]^2 = \frac{1}{2n} || S(XV) w - y||_2^2
$$
The learning algorithm is as before with linear models:
start at $w,V=0$ (or at small random values close to zero) and then take steps in the opposite direction of the gradient.
We therefore need to compute the gradients of $\mathcal L$ with respect to the parameters $w,V$.
We consider the gradient with respect to a single observation $i$:
$$
\nabla_w \frac 1 2 [ f(x_i) - y_i ]^2 =
\underbrace{
\frac{\partial}{\partial b_i} \frac 1 2 [b_i -y_i]^2
}_{
b_i - y_i = \delta^w_i
}
\underbrace{
\nabla_w \underbrace{
b_i
}_{
w^T z_i
}
}_{
z_i
}
= \delta^w_i z_i
$$
And we consider the gradient of the weights $v_k\in \mathbb R^p$ used to predict hidden unit $k$:
\begin{eqnarray*}
\nabla_{v_k} \frac 1 2 [ f(x_i) - y_i]^2
&=& \underbrace{
\frac{\partial}{\partial a_{ik}} \frac 1 2 [ w^T S(a_i) - y_i ]^2
}_{
\delta^v_{ik}
} \nabla_{v_k} \underbrace{
a_{ik}
}_{
v_k^T x_i
} = \delta_{ik}^v x_i \\
&=& \underbrace{
[ \frac{\partial}{\partial b_i} \frac 1 2 ( b_i - y_i )^2 ]
}_{
\delta^w_i
}
\underbrace{
[ \frac{\partial}{\partial a_{ik}} b_i ]
}_{
w_k \sigma'(a_{ik})
}
x_i\\
\end{eqnarray*}
The first layer errors $\delta_i^v = \left[\begin{array}{ccc}
\delta_{i1}^v & \cdots & \delta_{iu}^v
\end{array}\right]\in\mathbb R^u$ can thus be written in terms of the second layer errors
$\delta_i^w\in\mathbb R$.
Therefore the gradient of the full first layer weight matrix $V$ with respect to one observation is
$$
\nabla_V \frac 1 2 [ f(x_i) - y_i ]^2 = x_i \delta_i^{vT}
$$
The algorithm is to compute, in order:
\begin{tabular}{lll}
quantity & one observation & batch \\
\hline
hidden before sigmoid & $a_i = V^T x_i \in \mathbb R^u$ &
$A = XV\in\mathbb R^{n\times u}$ \\
hidden after sigmoid & $z_i = S(a_i)\in\mathbb R^u$ &
$Z = S(A)\in\mathbb R^{n\times u}$\\
predictions & $b_i = w^T z_i \in \mathbb R$ &
$b = Z w\in \mathbb R^n$\\
second level errors & $\delta_i^w = b_i - y_i \in\mathbb R$ &
$\delta^w = b-y\in\mathbb R^n$\\
first level errors & $\delta_i^v = \delta_i^w \Diag(w) S'(a_i)\in\mathbb R^u$ &
$\delta^v = \Diag(\delta^w) S'(A) \Diag(w)\in\mathbb R^{n\times u}$\\
second level gradient & $\nabla_w \frac 1 2 [f(x_i)-y_i]^2 = \delta_i^w z_i\in\mathbb R^u$ &
$\nabla_w \mathcal L(w, V) = \frac 1 n \sum_{i=1}^n \delta_i^w z_i = Z^T \delta^w/n\in\mathbb R^u$\\
first level gradient & $\nabla_V \frac 1 2 [f(x_i)-y_i]^2 = x_i \delta_i^{vT}\in\mathbb R^{p\times u}$ &
$\nabla_V \mathcal L(w, V) = \frac 1 n \sum_{i=1}^n x_i \delta_i^{vT} = X^T \delta^v/n\in\mathbb R^{p\times u}$
\end{tabular}
Note that since $\sigma'(t)=\sigma(t)[1-\sigma(t)]$ the derivative of
the sigmoid can be computed as
\begin{equation}
S'(a_i) = \Diag(z_i) (\mathbf 1_u - z_i)
\end{equation}
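A minimal R sketch of the batch computations in the table above,
repeated for several gradient descent iterations on simulated
regression data; the number of hidden units, step size, and iteration
count are arbitrary choices for illustration.
\begin{verbatim}
## Gradient descent for a single hidden layer regression network
## (illustrative sketch following the batch column of the table above).
sigmoid <- function(z) 1/(1+exp(-z))
set.seed(1)
n <- 100; p <- 2; u <- 5                    # observations, features, hidden units
X <- matrix(rnorm(n*p), n, p)
y <- sin(X[,1]) + rnorm(n, sd=0.1)
V <- matrix(rnorm(p*u, sd=0.1), p, u)       # first layer weights,  p x u
w <- rnorm(u, sd=0.1)                       # second layer weights, u
step.size <- 0.5
for(iteration in 1:1000){
  A <- X %*% V                              # hidden units before sigmoid, n x u
  Z <- sigmoid(A)                           # hidden units after sigmoid,  n x u
  b <- as.numeric(Z %*% w)                  # predictions, n
  delta.w <- b - y                          # second level errors, n
  delta.v <- delta.w * Z * (1-Z) * matrix(w, n, u, byrow=TRUE)  # first level errors, n x u
  grad.w <- t(Z) %*% delta.w / n            # second level gradient, u
  grad.V <- t(X) %*% delta.v / n            # first level gradient,  p x u
  w <- w - step.size * as.numeric(grad.w)
  V <- V - step.size * grad.V
}
mean((as.numeric(sigmoid(X %*% V) %*% w) - y)^2)   # train error after learning
\end{verbatim}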
\section{Backpropagation for single layer binary classification network}
TODO
\bibliographystyle{abbrvnat}
\bibliography{refs}
\end{document}