\documentclass{article}
\usepackage{fullpage}
\usepackage{float}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{url, hyperref}
\usepackage{algorithm2e}
\usepackage[margin=0.25in]{geometry}
\usepackage{pgfplots}
\pgfplotsset{width=10cm,compat=1.18}
\usepgfplotslibrary{external}
\tikzexternalize
\lstset{basicstyle=\fontfamily{pcr}\footnotesize}
\graphicspath{{./}}
\newcommand{\paren}[1]{\left( #1 \right)}
\newcommand{\iprod}[1]{\left\langle #1 \right\rangle}
\makeatletter
\newcommand*\MY@rightharpoonupfill@{%
\arrowfill@\relbar\relbar\rightharpoonup
}
\newcommand*\overrightharpoon{%
\mathpalette{\overarrow@\MY@rightharpoonupfill@}%
}
\makeatother
\theoremstyle{definition}
\newtheorem{question}{Question}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Var}{Var}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\newcommand{\llnorm}[1]{\left\| #1 \right\|_2}
\newcommand{\mnorm}[1]{\left\| #1 \right\|_1}
\newcommand{\fnorm}[1]{\left\| #1 \right\|_F}
\title{\large CS480/680, Spring 2025\\\huge Review Notes}
\author{Student: Hongxu Xu (h445xu@uwaterloo.ca)}
\date{\today}
\setlength\parindent{0pt}
%\input{preamble}
\begin{document}
\maketitle
\newpage
\part{For Mid-Term}
\section{Perceptron}
\begin{question}[Linear Function]
\begin{equation}
\begin{split}
& \forall \alpha, \beta \in \mathbb{R}, \forall \boldsymbol{x}, \boldsymbol{z} \in \mathbb{R}^d,
f(\alpha \boldsymbol{x} + \beta \boldsymbol{z}) = \alpha \cdot f(\boldsymbol{x}) + \beta \cdot f(\boldsymbol{z}) \\
\Longleftrightarrow
& \exists \boldsymbol{w} \in \mathbb{R}^d, f(\boldsymbol{x}) = \iprod{\boldsymbol{x}, \boldsymbol{w}}
\end{split}
\end{equation}
\textbf{Proof $\Rightarrow$:}
Let $\boldsymbol{w} \coloneq \left[ f(\boldsymbol{e_1}), \dots, f(\boldsymbol{e_d}) \right]^T$, where
$\boldsymbol{e_i}$ is the $i$-th coordinate vector.
\begin{equation}
\begin{split}
f(\boldsymbol{x}) & = f( x_1 \boldsymbol{e_1} + \dots + x_d \boldsymbol{e_d} ) \\
& = x_1 f(\boldsymbol{e_1}) + \dots + x_d f(\boldsymbol{e_d}) \\
& = \iprod{\boldsymbol{x}, \boldsymbol{w}}
\end{split}
\end{equation}
\textbf{Proof $\Leftarrow$:}
\begin{equation}
\begin{split}
f(\alpha \boldsymbol{x} + \beta \boldsymbol{z}) & = \iprod{\alpha \boldsymbol{x} + \beta \boldsymbol{z}, \boldsymbol{w}} \\
& = \iprod{\alpha \boldsymbol{x}, \boldsymbol{w}} + \iprod{\beta \boldsymbol{z}, \boldsymbol{w}} \\
& = \alpha \iprod{\boldsymbol{x}, \boldsymbol{w}} + \beta \iprod{\boldsymbol{z}, \boldsymbol{w}} \\
& = \alpha f(\boldsymbol{x}) + \beta f(\boldsymbol{z})
\end{split}
\end{equation}
\end{question}
\begin{question}[$\boldsymbol{w}$ is Orthogonal to Decision Boundary $H$]
Any vector lying in $H$ can be written as $\overrightharpoon{\boldsymbol{x}\boldsymbol{x'}} = \boldsymbol{x'} - \boldsymbol{x}$ for some $\boldsymbol{x}, \boldsymbol{x'} \in H$,
\begin{equation}
\iprod{\boldsymbol{x'} - \boldsymbol{x}, \boldsymbol{w}}
= \iprod{\boldsymbol{x'}, \boldsymbol{w}} - \iprod{\boldsymbol{x}, \boldsymbol{w}}
= -b - (-b) = 0
\end{equation}
\end{question}
\begin{question}[Update Rule for Perceptron]
\begin{equation} \label{perceptron-update-rule}
\begin{split}
\boldsymbol{w} \leftarrow \boldsymbol{w} + y_i \boldsymbol{x_i} \\
b \leftarrow b + y_i
\end{split}
\end{equation}
\end{question}
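A minimal NumPy sketch of this update rule; the data layout and the stopping criterion are illustrative assumptions, not from the lecture:
\begin{lstlisting}[language=python]
import numpy as np

def perceptron(X, y, max_epochs=100):
    """X: (n, d) data; y: (n,) labels in {-1, +1}."""
    n, d = X.shape
    w, b = np.zeros(d), 0.0
    for _ in range(max_epochs):
        mistakes = 0
        for i in range(n):
            if y[i] * (X[i] @ w + b) <= 0:   # mistake on x_i
                w += y[i] * X[i]             # w <- w + y_i x_i
                b += y[i]                    # b <- b + y_i
                mistakes += 1
        if mistakes == 0:                    # converged: all points correct
            break
    return w, b
\end{lstlisting}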
\begin{question}[Feasibility of Perceptron]
The goal is to find $\boldsymbol{w} \in \mathbb{R}^d, b \in \mathbb{R}$,
such that $\forall i, y_i \paren{\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b} > 0$.
According to the update rule \ref{perceptron-update-rule},
\begin{equation}
\begin{split}
y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{new}}} + b_{new} \right]
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}} + y \boldsymbol{x}} + b_{old} + y \right] \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + y\left[ \iprod{\boldsymbol{x} ,y \boldsymbol{x}} + y \right] \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + y\left[ y \llnorm{\boldsymbol{x}}^2 + y \right] \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + y^2 \llnorm{\boldsymbol{x}}^2 + y^2 \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + \underbrace{\llnorm{\boldsymbol{x}}^2 + 1}_{\text{always positive}}
\end{split}
\end{equation}
Notice that $y \in \left\{ \pm 1 \right\} \Rightarrow y^2 = 1$.
$\llnorm{\boldsymbol{x}}^2 + 1$ is always positive, which means we always increase the confidence $y \hat{y}$ after the update.
\end{question}
\begin{question}[Trick for Hiding the Bias Term -- Padding]
\begin{equation}
\begin{split}
\iprod{\boldsymbol{x}, \boldsymbol{w}} + b
& = \iprod{ \begin{pmatrix} \boldsymbol{x} \\ 1 \end{pmatrix}, \begin{pmatrix} \boldsymbol{w} \\ b \end{pmatrix} } \\
& = \iprod{ \boldsymbol{x_{pad}}, \boldsymbol{w_{pad}}}
\end{split}
\end{equation}
Correspondingly, the update rule can be written as:
\begin{equation}
\boldsymbol{w_{pad}} \leftarrow \boldsymbol{w_{pad}} + y \boldsymbol{x_{pad}}
\end{equation}
\end{question}
\begin{question}[Margin]
Suppose $\exists \boldsymbol{w^*}$ such that $\forall i, y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w^*}} > 0$.
We normalize $\boldsymbol{w^*}$ such that $\llnorm{\boldsymbol{w^*}} = 1$.
In other words, $\boldsymbol{w^*}$ is the normalized weight for the decision boundary.
\begin{equation} \label{margin}
\text{Margin } \gamma \coloneq \min_i \left| \iprod{\boldsymbol{x_i}, \boldsymbol{w^*}} \right|
\end{equation}
\end{question}
\begin{question}[Convergence Theorem -- Linearly Separable Case]
Assume that $\forall i, \llnorm{\boldsymbol{x_i}} \leq C$ (i.e., all data lie within a ball of radius $C$).
Then the Perceptron algorithm converges after at most $\frac{C^2}{\gamma^2}$ mistakes.
\vspace{5mm}
\textbf{Proof:}
Suppose $\boldsymbol{w}$ is the updating weight, and $\theta$ is the angle between $\boldsymbol{w}$ and $\boldsymbol{w^*}$.
We have $\iprod{\boldsymbol{w}, \boldsymbol{w^*}} = \llnorm{\boldsymbol{w}} \llnorm{\boldsymbol{w^*}} \cos{\theta} = \llnorm{\boldsymbol{w}} \cos{\theta}$.
After an update, $\llnorm{\boldsymbol{w_{new}}} \cos{\theta_{new}}$ will be
\begin{equation}
\begin{split}
\iprod{\boldsymbol{w} + y \boldsymbol{x}, \boldsymbol{w^*}}
& = \iprod{\boldsymbol{w}, \boldsymbol{w^*}} + y \iprod{\boldsymbol{x}, \boldsymbol{w^*}} \\
& = \iprod{\boldsymbol{w}, \boldsymbol{w^*}} + \left|\iprod{\boldsymbol{x}, \boldsymbol{w^*}}\right| \\
& \geq \iprod{\boldsymbol{w}, \boldsymbol{w^*}} + \gamma
\end{split}
\end{equation}
Let's see the change of $\iprod{\boldsymbol{w_{new}}, \boldsymbol{w_{new}}} = \llnorm{\boldsymbol{w_{new}}}^2$,
\begin{equation}
\begin{split}
\iprod{\boldsymbol{w} + y \boldsymbol{x}, \boldsymbol{w} + y \boldsymbol{x}}
& = \iprod{\boldsymbol{w}, \boldsymbol{w}} + 2y \iprod{\boldsymbol{w}, \boldsymbol{x}} + y^2 \iprod{\boldsymbol{x}, \boldsymbol{x}} \\
& = \llnorm{\boldsymbol{w}}^2 + 2y \iprod{\boldsymbol{w}, \boldsymbol{x}} + \llnorm{\boldsymbol{x}}^2
\end{split}
\end{equation}
Because the update only happens on a mistake, $y\iprod{\boldsymbol{w}, \boldsymbol{x}} \leq 0$; together with $\llnorm{\boldsymbol{x}} \leq C$,
\begin{equation}
\begin{split}
\iprod{\boldsymbol{w} + y \boldsymbol{x}, \boldsymbol{w} + y \boldsymbol{x}}
& = \llnorm{\boldsymbol{w}}^2 + 2y \iprod{\boldsymbol{w}, \boldsymbol{x}} + \llnorm{\boldsymbol{x}}^2 \\
& \leq \llnorm{\boldsymbol{w}}^2 + C^2
\end{split}
\end{equation}
Finally, suppose the algorithm has made $M$ updates; then $\iprod{\boldsymbol{w}, \boldsymbol{w^*}} \geq M \gamma$ and $\llnorm{\boldsymbol{w}}^2 \leq M C^2$, so
\begin{equation}
\begin{split}
1 \geq \cos{\theta} & = \frac{\iprod{\boldsymbol{w}, \boldsymbol{w^*}}}{\llnorm{\boldsymbol{w}} \llnorm{\boldsymbol{w^*}}} \\
& \geq \frac{M \gamma}{\sqrt{M C^2} \times 1} \\
& = \sqrt{M} \frac{\gamma}{C}
\end{split}
\end{equation}
which means $M \leq \frac{C^2}{\gamma^2}$.
\end{question}
\begin{question}[Perceptron Loss]
\begin{equation}
\begin{split}
l(\boldsymbol{w}, \boldsymbol{x_t}, y_t)
& = -y_t \iprod{\boldsymbol{w}, \boldsymbol{x_t}} \mathbb{I}[\text{mistake on }\boldsymbol{x_t}] \\
& = -\min \left\{ y_t \iprod{\boldsymbol{w}, \boldsymbol{x_t}}, 0 \right\}
\end{split}
\end{equation}
\begin{equation}
L(\boldsymbol{w}) = -\frac{1}{n} \sum_{t=1}^n y_t \iprod{\boldsymbol{w}, \boldsymbol{x_t}} \mathbb{I} [\text{mistake on }\boldsymbol{x_t}]
\end{equation}
\end{question}
\section{Linear Regression}
\begin{question}[Least Square Regression]
\begin{equation}
\min_{f: \mathcal{X} \rightarrow \mathcal{Y}} \mathbb{E} \llnorm{f(\boldsymbol{X}) - Y}^2
\end{equation}
The optimal regression function is
\begin{equation}
f^*(\boldsymbol{x}) = m(\boldsymbol{x}) = \mathbb{E} [Y | \boldsymbol{X} = \boldsymbol{x}]
\end{equation}
Computing it requires knowing the joint distribution of $(\boldsymbol{X}, Y)$.
\end{question}
\begin{question}[Bias-Variance Decomposition]
\begin{equation}
\begin{split}
\mathbb{E} \llnorm{f(\boldsymbol{X}) - Y}^2
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X}) + m(\boldsymbol{X}) - Y}^2 \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - Y} \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \; \mathbb{E}_{Y|\boldsymbol{X}} \left[ \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - Y} \right] \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - \mathbb{E}_{Y|\boldsymbol{X}} [Y]} \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - m(\boldsymbol{X})} \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underbrace{\mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2}_{\text{noise (variance)}}
\end{split}
\end{equation}
The last term is the noise (variance), which does not depend on $f$.
So, to minimize the squared error, we need $f \approx m$.
However, $m(\boldsymbol{x}) = \mathbb{E} [Y | \boldsymbol{X} = \boldsymbol{x}]$ cannot be computed, because the conditional distribution is unknown.
Instead, we learn $f_D$ from the training data $D$. Define $\bar{f}(\boldsymbol{X}) \coloneq \mathbb{E}_D[f_D(\boldsymbol{X})]$.
\begin{equation}
\begin{split}
\underbrace{\mathbb{E}_{\boldsymbol{X}, Y, D} \llnorm{f_D(\boldsymbol{X}) - Y}^2}_{\text{test error}}
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}) + \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X})}^2 + \mathbb{E}_{\boldsymbol{X}} \llnorm{\bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})}^2 \\
& \qquad + 2 \mathbb{E}_{\boldsymbol{X}, D} \iprod{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} \\
& \qquad + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \dots + 2 \mathbb{E}_{\boldsymbol{X}} \mathbb{E}_{D} \iprod{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} + \underline{\dots} \\
& = \dots + 2 \mathbb{E}_{\boldsymbol{X}} \iprod{\mathbb{E}_{D} [f_D(\boldsymbol{X})] - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} + \underline{\dots} \\
& = \dots + 2 \mathbb{E}_{\boldsymbol{X}} \iprod{\bar{f}(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} + \underline{\dots} \\
& = \dots + 0 + \underline{\dots} \\
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X})}^2 + \mathbb{E}_{\boldsymbol{X}} \llnorm{\bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \underbrace{\mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \mathbb{E}_D [f_D(\boldsymbol{X})]}^2}_{\text{variance}} + \underbrace{\mathbb{E}_{\boldsymbol{X}} \llnorm{\mathbb{E}_D [f_D(\boldsymbol{X})] - m(\boldsymbol{X})}^2}_{\text{bias}^2} + \underbrace{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2}_{\text{noise (variance)}}
\end{split}
\end{equation}
\end{question}
\begin{question}[Sampling $\rightarrow$ Training]
Replace the expectation with a sample average over $\paren{\boldsymbol{X_i}, Y_i} \sim P$.
\begin{equation}
\min_{f: \mathcal{X} \rightarrow \mathcal{Y}} \hat{\mathbb{E}} \llnorm{f(\boldsymbol{X}) - Y}^2 \coloneq \frac{1}{n} \sum_{i=1}^n \llnorm{f(\boldsymbol{X_i}) - Y_i}^2
\end{equation}
Uniform law of large numbers: as the training data size $n \rightarrow \infty$,
$\hat{\mathbb{E}} \rightarrow \mathbb{E}$ and (hopefully) $\argmin \hat{\mathbb{E}} \rightarrow \argmin \mathbb{E}$.
\end{question}
\begin{question}[Linear Regression]
Padding: $\boldsymbol{x} \leftarrow \begin{pmatrix}\boldsymbol{x} \\ 1\end{pmatrix}$, $W \leftarrow \begin{pmatrix}W \\ \boldsymbol{b}\end{pmatrix}$
\begin{align*}
X & = [\boldsymbol{x_1}, \dots, \boldsymbol{x_n}] \in \mathbb{R}^{(d+1) \times n}, \\
Y & = [\boldsymbol{y_1}, \dots, \boldsymbol{y_n}] \in \mathbb{R}^{t \times n}, \\
W & \in \mathbb{R}^{t \times (d+1)}, \\
\fnorm{A} & = \sqrt{\sum_{ij} a_{ij}^2}
\end{align*}
Linear regression is:
\begin{equation}
\min_{W \in \mathbb{R}^{t \times (d+1)}} \frac{1}{n} \fnorm{WX - Y}^2
\end{equation}
\end{question}
\begin{question}[Optimality Condition]
If $\boldsymbol{w}$ is a minimizer (or maximizer) of a differentiable function $f$
\textbf{over an open set}, then $f'(\boldsymbol{w}) = 0$.
\end{question}
\begin{question}[Solving Linear Regression]
\begin{equation}
L(W) = \frac{1}{n} \fnorm{WX-Y}^2
\end{equation}
\begin{equation}
\begin{split}
\nabla_W L(W) & = \frac{2}{n} (WX-Y) X^T = 0 \\
& \Rightarrow W X X^T = Y X^T \\
& \Rightarrow W = Y X^T (X X^T)^{-1} \\
\end{split}
\end{equation}
\end{question}
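A NumPy sketch of this closed form, using a linear solve instead of an explicit inverse; the column-wise data layout follows the notes, and the helper name is an assumption:
\begin{lstlisting}[language=python]
import numpy as np

def solve_linear_regression(X, Y):
    """X: (d+1, n) padded inputs; Y: (t, n) targets.
    Returns W = Y X^T (X X^T)^{-1}, shape (t, d+1)."""
    # solve (X X^T) Z = X Y^T, then W = Z^T; avoids forming the inverse
    return np.linalg.solve(X @ X.T, X @ Y.T).T
\end{lstlisting}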
\begin{question}[Ill-Conditioning]
A slight perturbation of the data can lead to drastically different solutions, which happens whenever
$X$ is ill-conditioned, i.e., (close to) rank-deficient.
\vspace{5mm}
A rank-deficient $X$ means:
\begin{enumerate}
\item two columns in $X$ are linearly dependent (or simply the same)
\item but the corresponding $y$ might be different
\end{enumerate}
\end{question}
\begin{question}[Ridge Regression]
\begin{equation}
\min_{W} \frac{1}{n} \fnorm{WX - Y}^2 + \lambda \fnorm{W}^2
\end{equation}
\begin{equation}
\begin{split}
\nabla_W L(W) & = \frac{2}{n} (WX - Y) X^T + 2 \lambda W = 0 \\
& \Rightarrow W X X^T - Y X^T + n \lambda W = 0 \\
& \Rightarrow W (X X^T + n \lambda I) = Y X^T
\end{split}
\end{equation}
\begin{equation}
\begin{split}
& X = U \Sigma V^T \\
\Rightarrow & X X^T = U \Sigma (V^T V) \Sigma U^T = U \Sigma^2 U^T \\
\Rightarrow & X X^T + n \lambda I = U \underbrace{(\Sigma^2 + n \lambda I)}_{\text{strictly positive}} U^T \\
\Rightarrow & X X^T + n \lambda I \text{ is of full-rank}
\end{split}
\end{equation}
$\lambda$ is the regularization parameter; as $\lambda \rightarrow \infty$, $W \rightarrow \boldsymbol{0}$.
\end{question}
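The same sketch extended with the ridge term; the helper name and the strictly positive \texttt{lam} are assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def solve_ridge(X, Y, lam):
    """X: (d+1, n); Y: (t, n); lam > 0.
    Returns W satisfying W (X X^T + n*lam*I) = Y X^T."""
    d1, n = X.shape
    A = X @ X.T + n * lam * np.eye(d1)  # full-rank for any lam > 0
    return np.linalg.solve(A, X @ Y.T).T
\end{lstlisting}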
\begin{question}[Regularization $\equiv$ Data Augmentation]
\begin{equation}
\frac{1}{n} \fnorm{WX - Y}^2 + \lambda \fnorm{W}^2
= \frac{1}{n} \fnorm{W \left[X \quad \sqrt{n\lambda} I \right] - \left[Y \quad \boldsymbol{0}\right]}^2
\end{equation}
\end{question}
\section{Logistic Regression}
\begin{question}[Max Likelihood Estimation]
Let $\mathcal{Y} = \{0, 1\}$.
Learn confidence $p(\boldsymbol{x}; \boldsymbol{w}) \coloneq \Pr (Y=1|X=\boldsymbol{x})$.
\begin{equation}
\begin{split}
\max_{\boldsymbol{w}} \Pr (Y_1 = y_1, \dots, Y_n = y_n)
& = \max_{\boldsymbol{w}} \prod_{i=1}^n \Pr (Y_i = y_i | X_i = x_i) \\
& \stackrel{\mathcal{Y}=\{0, 1\}}{=} \max_{\boldsymbol{w}} \prod_{i=1}^n \left[p(\boldsymbol{x_i}; \boldsymbol{w})\right]^{y_i} \left[1 - p(\boldsymbol{x_i}; \boldsymbol{w})\right]^{1 - y_i} \\
\end{split}
\end{equation}
Use negative log-likelihood:
\begin{equation}
\min_{\boldsymbol{w}} \sum_{i=1}^n \left[ -y_i \log p(\boldsymbol{x_i}; \boldsymbol{w}) - (1 - y_i) \log (1 - p(\boldsymbol{x_i}; \boldsymbol{w})) \right]
\end{equation}
\end{question}
\begin{question}[Odds Ratio and Sigmoid]
\begin{equation}
\text{Odds} = \frac{p}{1 - p}, \qquad p = \Pr(Y = 1 | X = \boldsymbol{x})
\end{equation}
Assume $\log \frac{p(\boldsymbol{x};\boldsymbol{w})}{1 - p(\boldsymbol{x};\boldsymbol{w})} = \iprod{\boldsymbol{x}, \boldsymbol{w}}$.
\vspace{5mm}
The Sigmoid transformation is:
\begin{equation}
p(\boldsymbol{x}; \boldsymbol{w}) = \frac{1}{1 + \exp (-\iprod{\boldsymbol{x}, \boldsymbol{w}})}
\end{equation}
\end{question}
\begin{question}[Logistic Regression]
Plug the sigmoid in the negative log-likelihood:
\begin{equation}
\begin{split}
& \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ -y_i \log p(\boldsymbol{x_i}; \boldsymbol{w}) - (1 - y_i) \log (1 - p(\boldsymbol{x_i}; \boldsymbol{w})) \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ y_i \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] - (1 - y_i) \log \left(1 - \frac{1}{1 + \exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})}\right) \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ y_i \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] - (1 - y_i) \log \left(\frac{\exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})}{1 + \exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})}\right) \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ y_i \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + (1 - y_i) \left[\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + \log(1 + \exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}}))\right] \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} \right] \\
\end{split}
\end{equation}
Because $y_i \in \{0, 1\}$, let's map it to $\{\pm 1\}$.
\begin{equation}
\begin{split}
L(\boldsymbol{w})
& \stackrel{y_i \in \{0, 1\}}{=} \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} \\
& \stackrel{y_i \in \{0, 1\}}{=} \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + \log \left[ \exp ( (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} ) \right] \\
& \stackrel{y_i \in \{0, 1\}}{=} \log [(1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})) \cdot \exp ( (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} ) ] \\
& \stackrel{y_i \in \{0, 1\}}{=} \log [\exp((1-y_i)\iprod{\boldsymbol{x_i}, \boldsymbol{w}}) + \exp(-y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w}})] \\
& \stackrel{y_i \in \{0, 1\}}{=} \begin{cases}
\log [\exp(\iprod{\boldsymbol{x_i}, \boldsymbol{w}}) + 1] & y_i = 0 \\
\log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] & y_i = 1
\end{cases} \\
& \stackrel{y_i \in \{\pm 1\}}{=} \log [ 1 + \exp(-y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w}}) ]
\end{split}
\end{equation}
\end{question}
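A NumPy sketch of minimizing the $\{\pm 1\}$-label form of this loss by plain gradient descent; the step size and iteration count are arbitrary assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

def logistic_regression_gd(X, y, lr=0.1, steps=1000):
    """X: (n, d); y: (n,) with labels in {-1, +1}.
    Minimizes the mean of log(1 + exp(-y_i <x_i, w>))."""
    w = np.zeros(X.shape[1])
    for _ in range(steps):
        margins = y * (X @ w)                      # y_i <x_i, w>
        # gradient: mean of -y_i x_i * sigmoid(-margin_i)
        grad = -(X.T @ (y * sigmoid(-margins))) / len(y)
        w -= lr * grad
    return w
\end{lstlisting}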
\begin{question}[Multi-Class: Sigmoid $\rightarrow$ Softmax]
\begin{equation}
\Pr(Y=k | X = \boldsymbol{x}; \boldsymbol{W} = [\boldsymbol{w_1}, \dots, \boldsymbol{w_c}])
= \frac{\exp(\iprod{\boldsymbol{x}, \boldsymbol{w_k}})}{\sum_{l=1}^c \exp(\iprod{\boldsymbol{x}, \boldsymbol{w_l}})}
\end{equation}
Maximum likelihood estimation (log loss, cross-entropy loss):
\begin{equation}
\min_{\boldsymbol{W}} \sum_{i=1}^n \left[ -\log \frac{\exp(\iprod{\boldsymbol{x_i}, \boldsymbol{w_{y_i}}})}{\sum_{l=1}^c \exp(\iprod{\boldsymbol{x_i}, \boldsymbol{w_l}})} \right]
\end{equation}
\end{question}
\section{Hard-Margin Support Vector Machines}
\begin{question}[Distance from a Point to a Hyperplane]
Let $H \coloneq \{\boldsymbol{x}: \iprod{\boldsymbol{x}, \boldsymbol{w}} + b = 0\}$, and let $\boldsymbol{x}$ be any point on $H$.
\begin{equation}
\text{Distance}(\boldsymbol{x_i}, \boldsymbol{w})
= \frac{|\iprod{\boldsymbol{x_i} - \boldsymbol{x}, \boldsymbol{w}}|}{\llnorm{\boldsymbol{w}}}
= \frac{|\iprod{\boldsymbol{x_i}, \boldsymbol{w}} - \iprod{\boldsymbol{x}, \boldsymbol{w}}|}{\llnorm{\boldsymbol{w}}}
\stackrel{\boldsymbol{x} \in H}{=} \frac{|\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b|}{\llnorm{\boldsymbol{w}}}
\stackrel{y_i \hat{y_i} > 0}{=} \frac{y_i \hat{y_i}}{\llnorm{\boldsymbol{w}}}
\end{equation}
\end{question}
\begin{question}[Margin Maximization]
Margin is the smallest distance to $H$ among all separable data.
\begin{equation}
\max_{\boldsymbol{w}, b} \min_i \frac{y_i \hat{y_i}}{\llnorm{\boldsymbol{w}}}
\text{, such that }
\forall i, y_i \hat{y_i} > 0
\end{equation}
For any $c > 0$, rescaling $(\boldsymbol{w}, b) \rightarrow (c\boldsymbol{w}, cb)$ leaves the objective unchanged:
\begin{equation}
\begin{split}
\max_{\boldsymbol{w}, b} \min_i \frac{c y_i \hat{y_i}}{\llnorm{c\boldsymbol{w}}}
& = \max_{\boldsymbol{w}, b} \min_i \frac{y_i (\iprod{\boldsymbol{x_i}, c\boldsymbol{w}} + cb)}{\llnorm{c\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \min_i \frac{c y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b)}{c \llnorm{\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \min_i \frac{y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b)}{\llnorm{\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \min_i \frac{y_i \hat{y_i}}{\llnorm{\boldsymbol{w}}}
\end{split}
\end{equation}
Let $c = \frac{1}{\min_i y_i \hat{y_i}}$,
\begin{equation}
\begin{split}
\max_{\boldsymbol{w}, b} \min_i \frac{c y_i \hat{y_i}}{c \llnorm{\boldsymbol{w}}}
& = \max_{\boldsymbol{w}, b} \frac{1}{c \llnorm{\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \frac{1}{\llnorm{\boldsymbol{w}}} \text{ s.t. } \min_i y_i \hat{y_i} = 1
\end{split}
\end{equation}
Max $\rightarrow$ Min:
\begin{equation}
\min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1
\end{equation}
\end{question}
\begin{question}[Hard-Margin SVM vs. Perceptron]
\begin{align}
\text{Hard-Margin SVM:} \qquad
& \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 & \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1 \\
\text{Perceptron:} \qquad
& \min_{\boldsymbol{w}, b} 0 & \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1
\end{align}
\end{question}
\begin{question}[Lagrangian Dual]
Dual variables $\boldsymbol{\alpha} \in \mathbb{R}^n$.
\begin{equation}
\begin{split}
\min_{\boldsymbol{w}, b} \; \max_{\boldsymbol{\alpha} \geq 0} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 - \sum_i \alpha_i \left[ y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) - 1 \right]
& = \min_{\boldsymbol{w}, b} \; \begin{cases}
+ \infty, & \text{if } \exists i, y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) < 1 \quad (\alpha_i = + \infty) \\
\frac{1}{2} \llnorm{\boldsymbol{w}}^2 , & \text{if } \forall i, y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) \geq 1 \quad (\forall i, \alpha_i = 0) \\
\end{cases} \\
& = \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2, \quad \text{s.t. } \forall i, y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) \geq 1 \\
& = \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1
\end{split}
\end{equation}
Swap $\min$ and $\max$:
\begin{equation} \label{lagrangian-max-min}
\max_{\boldsymbol{\alpha} \geq 0} \; \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 - \sum_i \alpha_i \left[ y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) - 1 \right]
\end{equation}
Solve inner problem by setting derivative to 0:
\begin{equation} \label{solve-derivative-for-svm-dual}
\frac{\partial L}{\partial \boldsymbol{w}} = \boldsymbol{w} - \sum_i \alpha_i y_i \boldsymbol{x_i} = 0,
\qquad
\frac{\partial L}{\partial b} = - \sum_i \alpha_i y_i = 0
\end{equation}
Plug them into the loss:
\begin{equation}
\begin{split}
L(\boldsymbol{\alpha})
& = \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 - \sum_i \alpha_i \left[ y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) - 1 \right] \\
& = \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 - \sum_i \alpha_i y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w}} - \sum_i \alpha_i y_i b + \sum_i \alpha_i \\
& = \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 - \iprod{\sum_i \alpha_i y_i \boldsymbol{x_i}, \sum_i \alpha_i y_i \boldsymbol{x_i}} - b \sum_i \alpha_i y_i + \sum_i \alpha_i \\
& = \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 - \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 + \sum_i \alpha_i \\
& = \sum_i \alpha_i - \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2,
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{split}
\end{equation}
Thus, the dual problem \ref{lagrangian-max-min} becomes:
\begin{equation}
\max_{\boldsymbol{\alpha} \geq 0} \sum_i \alpha_i - \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2,
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{equation}
Max $\rightarrow$ min and expand the norm:
\begin{equation}
\min_{\boldsymbol{\alpha} \geq 0} - \sum_i \alpha_i + \frac{1}{2} \sum_i \sum_j \alpha_i \alpha_j y_i y_j \underbrace{\iprod{\boldsymbol{x_i}, \boldsymbol{x_j}}}_{\text{Kernel, closed form w.r.t. }\boldsymbol{x_i}, \boldsymbol{x_j}},
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{equation}
\end{question}
\begin{question}[Support Vectors]
From \ref{solve-derivative-for-svm-dual}, we know $\boldsymbol{w} = \sum_i \alpha_i y_i \boldsymbol{x_i}$.
Vectors with $\alpha_i \neq 0$ are support vectors, which lie on the margin.
\end{question}
\section{Soft-Margin Support Vector Machines}
\begin{question}[Goal]
minimize over $\boldsymbol{w}, b$,
\begin{equation}
\Pr(Y \neq \sign(\hat Y)) = \Pr(Y \hat Y \leq 0)
= \mathbb{E} \; \underbrace{\mathbb{I} [Y \hat Y \leq 0]}_{\text{indicator function}}
\coloneq \mathbb{E} \; l_{0-1}(Y \hat Y)
\end{equation}
where $\hat Y = \iprod{\boldsymbol{X}, \boldsymbol{w}} + b$ and $Y \in \{\pm 1\}$.
\begin{equation}
\begin{split}
\min_{\hat Y: \mathcal{X} \rightarrow \mathbb{R}} \mathbb{E} \; l_{0-1}(Y \hat Y)
& = \min_{\hat Y: \mathcal{X} \rightarrow \mathbb{R}} \mathbb{E}_X \mathbb{E}_{Y|X} \; l_{0-1}(Y \hat Y) \\
& = \mathbb{E}_X \min_{\hat Y: \mathcal{X} \rightarrow \mathbb{R}} \mathbb{E}_{Y|X} \; l_{0-1}(Y \hat Y) \\
\end{split}
\end{equation}
Minimizing the 0-1 error is \textbf{NP-hard}.
\end{question}
\begin{question}[Bayes Rule]
\begin{equation}
\eta(\boldsymbol{x}) \coloneq \argmax_{\hat y \in \{\pm 1\}} \Pr (Y = \hat y | X = \boldsymbol{x})
\end{equation}
\begin{equation}
\eta(\boldsymbol{x}) \coloneq \argmin_{\hat y \in \mathbb{R}} \mathbb{E}_{Y|X=\boldsymbol{x}} \; l_{0-1}(Y \hat y)
\end{equation}
\end{question}
\begin{question}[Classification Calibrated]
A loss $l(y \hat y)$ is classification calibrated, iff $\forall \boldsymbol{x}$,
\begin{equation*}
\hat y(\boldsymbol{x}) \coloneq \argmin_{\hat y \in \mathbb{R}} \mathbb{E}_{Y | X = \boldsymbol{x}} \; l(Y \hat y)
\end{equation*}
has the same sign as the Bayes rule $\eta(\boldsymbol{x}) \coloneq \argmin_{\hat y \in \mathbb{R}} \mathbb{E}_{Y|X=\boldsymbol{x}} \; l_{0-1}(Y \hat y)$.
Notice: $\eta(\boldsymbol{x})$ and $\hat y (\boldsymbol{x})$ provide the \textbf{score}; their sign provides the prediction.
\end{question}
\begin{question}[Characterization under Convexity]
Any \textbf{convex} loss $l$ is classification calibrated iff $l$ is differentiable
at $0$ and $l'(0) < 0$.
\end{question}
\begin{question}[Hinge Loss]
\begin{equation}
l_{hinge}(y \hat y) = (1 - y \hat y)^+ \coloneq \max \{0, 1 - y \hat y\}
= \begin{cases}
1 - y \hat y, & \text{if } y \hat y < 1 \\
0, & \text{otherwise}
\end{cases}
\end{equation}
The classifier that minimizes the expected hinge loss minimizes the expected 0-1 loss.
\end{question}
\begin{question}[Soft-Margin SVM]
\begin{equation}
\min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + C \cdot \sum_i l_{hinge}(y_i \hat y_i), \quad \text{s.t. } \hat y_i = \iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b
\end{equation}
\end{question}
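The dual is derived next; alternatively, the primal above can be minimized directly by subgradient descent on the hinge loss. A rough NumPy sketch, where the step size and iteration count are arbitrary assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def svm_subgradient(X, y, C=1.0, lr=0.01, steps=1000):
    """Minimizes 0.5||w||^2 + C * sum_i max(0, 1 - y_i (<x_i, w> + b)).
    X: (n, d); y: (n,) labels in {-1, +1}."""
    n, d = X.shape
    w, b = np.zeros(d), 0.0
    for _ in range(steps):
        margins = y * (X @ w + b)
        active = margins < 1                     # points with nonzero hinge loss
        gw = w - C * (X[active].T @ y[active])   # subgradient w.r.t. w
        gb = -C * np.sum(y[active])              # subgradient w.r.t. b
        w -= lr * gw
        b -= lr * gb
    return w, b
\end{lstlisting}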
\begin{question}[Lagrangian Dual]
Apply $C \cdot l_{hinge}(t) \coloneq \max\{0, C(1 - t)\} = \max_{0 \leq \alpha \leq C} \alpha (1 - t)$
\begin{equation}
\min_{\boldsymbol{w}, b} \max_{0 \leq \boldsymbol{\alpha} \leq C} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i (1 - y_i \hat y_i)
\end{equation}
Swap $\min$ with $\max$:
\begin{equation}
\max_{0 \leq \boldsymbol{\alpha} \leq C} \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i (1 - y_i \hat y_i)
\end{equation}
Solve it by setting derivative to 0:
\begin{equation}
\frac{\partial L}{\partial \boldsymbol{w}} = \boldsymbol{w} - \sum_i \alpha_i y_i \boldsymbol{x_i} = 0,
\qquad
\frac{\partial L}{\partial b} = - \sum_i \alpha_i y_i = 0
\end{equation}
Plug them into the loss:
\begin{equation}
\begin{split}
\max_{0 \leq \boldsymbol{\alpha} \leq C} \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i (1 - y_i \hat y_i)
& = \max_{0 \leq \boldsymbol{\alpha} \leq C} \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i \left[ 1 - y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) \right] \\
& = \max_{0 \leq \boldsymbol{\alpha} \leq C} \sum_i \alpha_i - \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2,
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{split}
\end{equation}
Max $\rightarrow$ min and expand the norm:
\begin{equation}
\min_{0 \leq \boldsymbol{\alpha} \leq C} - \sum_i \alpha_i + \frac{1}{2} \sum_i \sum_j \alpha_i \alpha_j y_i y_j \underbrace{\iprod{\boldsymbol{x_i}, \boldsymbol{x_j}}}_{\text{Kernel, closed form w.r.t. }\boldsymbol{x_i}, \boldsymbol{x_j}},
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{equation}
$C \rightarrow \infty \Rightarrow \text{hard-margin SVM}$;
$C \rightarrow 0 \Rightarrow \text{a constant classifier}$.
\end{question}
\section{Reproducing Kernels}
\begin{question}[(Reproducing) Kernels]
$k: \mathcal{X} \times \mathcal{X} \rightarrow \mathbb{R}$ is a (reproducing) kernel
iff there exists some $\Phi: \mathcal{X} \rightarrow \mathcal{H}$ so that
$\iprod{\Phi(\boldsymbol{x}), \Phi(\boldsymbol{z})} = k(\boldsymbol{x}, \boldsymbol{z})$.
\begin{itemize}
\item A feature transform $\Phi$ determines the corresponding kernel $k$.
\item A kernel $k$ corresponds to some feature transform $\Phi$, which need not be unique. \\
E.g., $\iprod{\phi(\boldsymbol{x}), \phi(\boldsymbol{z})} = \iprod{\phi'(\boldsymbol{x}), \phi'(\boldsymbol{z})}$ for
\begin{enumerate}
\item $\phi(x) \coloneq [x_1^2, \sqrt{2} x_1 x_2] \in \mathbb{R}^2$
\item $\phi'(x) \coloneq [x_1^2, x_1 x_2, x_1 x_2] \in \mathbb{R}^3$
\end{enumerate}
\end{itemize}
\end{question}
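A quick numerical check that the two feature maps above induce the same kernel; the test vectors are arbitrary:
\begin{lstlisting}[language=python]
import numpy as np

def phi(x):   # 2-dimensional feature map
    return np.array([x[0]**2, np.sqrt(2) * x[0] * x[1]])

def phi2(x):  # 3-dimensional feature map, same kernel
    return np.array([x[0]**2, x[0] * x[1], x[0] * x[1]])

x, z = np.array([1.0, 2.0]), np.array([3.0, 4.0])
# both inner products equal x1^2 z1^2 + 2 x1 x2 z1 z2
assert np.isclose(phi(x) @ phi(z), phi2(x) @ phi2(z))
\end{lstlisting}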
\begin{question}[Mercer's Theorem]
$k: \mathcal{X} \times \mathcal{X} \rightarrow \mathbb{R}$ is a kernel, \\
iff $\forall n \in \mathbb{N}, \forall \boldsymbol{x_1}, \dots, \boldsymbol{x_n} \in \mathcal{X}$,
the kernel matrix $K$ such that $K_{ij} \coloneq k(\boldsymbol{x_i}, \boldsymbol{x_j})$ is symmetric and positive semi-definite (PSD).
\begin{equation}
k \text{ is a kernel} \Leftrightarrow \begin{cases}
K_{ij} = K_{ji} & \text{(symmetric)}\\
\iprod{\boldsymbol{\alpha}, K \boldsymbol{\alpha}} = \sum_{i=1}^n \sum_{j=1}^n \alpha_i \alpha_j K_{ij} \geq 0 \qquad \forall \boldsymbol{\alpha} \in \mathbb{R}^n & \text{(PSD)}
\end{cases}
\end{equation}
\end{question}
\begin{question}[Symmetric PSD]
For a symmetric matrix $A$, the following conditions are equivalent.
\begin{enumerate}
\item $A \succeq 0$
\item $A = U^T U$ for some matrix $U$
\item $x^T A x \geq 0$ for every $x \in \mathbb{R}^n$
\item All principal minors of $A$ are nonnegative
\end{enumerate}
\end{question}
\section{Gradient Descent}
\begin{question}[Gradient Descent Template]
Choose an initial point $x^{(0)} \in \mathbb{R}^d$ and repeat:
\begin{equation}
x^{(k)} = x^{(k-1)} - \underbrace{\eta}_{\text{step size}} \nabla f(x^{(k-1)}), \quad k = 1, 2, \dots
\end{equation}
\end{question}
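A direct NumPy transcription of this template; the quadratic example at the end is an illustrative assumption:
\begin{lstlisting}[language=python]
import numpy as np

def gradient_descent(grad_f, x0, eta=0.1, steps=100):
    """Iterates x^{(k)} = x^{(k-1)} - eta * grad_f(x^{(k-1)})."""
    x = np.asarray(x0, dtype=float)
    for _ in range(steps):
        x = x - eta * grad_f(x)
    return x

# example: f(x) = ||x||^2 / 2, so grad f(x) = x; minimum at the origin
x_star = gradient_descent(lambda x: x, np.array([3.0, -2.0]))
\end{lstlisting}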
\begin{question}[Interpretation from Taylor Expansion]
Expand $f$ locally at $x$, replacing the Hessian with $\frac{1}{t} \boldsymbol{I}$:
\begin{equation}
\begin{split}
f(y) & \approx f(x) + \nabla f(x)^T(y-x) + \frac{1}{2t}\llnorm{y-x}^2 \\
\Rightarrow \min_y f(y) & \approx \min_y \left[ f(x) + \nabla f(x)^T(y-x) + \frac{1}{2t}\llnorm{y-x}^2 \right] \\
\end{split}
\end{equation}
The minimum over $y$ is attained at $y - x = -t \nabla f(x)$, i.e., $y = x - t \nabla f(x)$, which is exactly one gradient descent step.
\end{question}
\begin{question}[$L$-smooth or $L$-Lipschitz Continuous]
Assume $f$ is convex and differentiable, and $\nabla f$ is $L$-Lipschitz continuous ($f$ is $L$-smooth). If $f$ is twice differentiable, this is equivalent to:
\begin{equation}
L \boldsymbol{I} \succeq \nabla^2 f(x), \forall x
\end{equation}
\end{question}
\begin{question}[Convergence Rate for Convex Case]
Assume $f$ is $L$-smooth.
Gradient descent with fixed step size $t \leq \frac{1}{L}$ satisfies:
\begin{equation}
f(x^{(k)}) - f(x^*) \leq \frac{\llnorm{x^{(0)} - x^*}^2}{2tk}
\end{equation}
We say gradient descent has convergence rate $O(\frac{1}{k})$,
i.e. $f(x^{(k)}) - f(x^*) \leq \epsilon$ can be achieved using only $O(\frac{1}{\epsilon})$ iterations.
\vspace{5mm}
\textbf{Proof}
\begin{equation}
\begin{split}
f(y)
& = f(x) + \nabla f(x)^T (y-x) + \frac{1}{2} (y-x)^T \nabla^2 f(\xi)(y-x) \\
& \leq f(x) + \nabla f(x)^T(y-x) + \frac{1}{2} L \llnorm{y-x}^2 \qquad (L\text{-smooth}, L \boldsymbol{I} \succeq \nabla^2 f(\xi))
\end{split}
\end{equation}
Plug in gradient descent:
\begin{equation} \label{convergence-1}
\begin{split}
f(x^+)
& = f(y) \\
& \leq f(x) + \nabla f(x)^T(x - t\nabla f(x)-x) + \frac{1}{2} L \llnorm{x - t\nabla f(x)-x}^2 \\
& = f(x) -(1 - \frac{1}{2}Lt) t \llnorm{\nabla f(x)}^2 \\
& \leq f(x) - \frac{1}{2} t \llnorm{\nabla f(x)}^2 \quad (t \leq \frac{1}{L})
\end{split}
\end{equation}
$f$ is convex $\Rightarrow f(x^*) \geq f(x) + \nabla f(x)^T(x^* - x) \Rightarrow f(x) \leq f(x^*) + \nabla f(x)^T(x - x^*)$.
Plug this into \ref{convergence-1}:
\begin{equation}
\begin{split}
& f(x^+) \leq f(x^*) + \nabla f(x)^T (x - x^*) - \frac{1}{2} t \llnorm{\nabla f(x)}^2 \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( 2t \nabla f(x)^T(x - x^*) - t^2 \llnorm{\nabla f(x)}^2 \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( 2t \nabla f(x)^T(x - x^*) - t^2 \llnorm{\nabla f(x)}^2 - \llnorm{x-x^*}^2 + \llnorm{x-x^*}^2 \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( \llnorm{x-x^*}^2 - (\llnorm{x-x^*}^2 + t^2 \llnorm{\nabla f(x)}^2 - 2t \nabla f(x)^T(x - x^*) ) \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( \llnorm{x-x^*}^2 - \llnorm{x - x^* - t\nabla f(x)}^2 \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( \llnorm{x-x^*}^2 - \llnorm{x^+ - x^*}^2 \right) \\
\end{split}
\end{equation}
Viewing $x^+$ as $x^{(i)}$ and $x$ as $x^{(i-1)}$, and summing over $i$:
\begin{equation}
\begin{split}
\sum_{i=1}^k \paren{ f(x^{(i)}) - f(x^*) }
& \leq \sum_{i=1}^k \frac{1}{2t} \paren{ \llnorm{x^{(i-1)} - x^*}^2 - \llnorm{x^{(i)} - x^*}^2 } \\
& = \frac{1}{2t} \paren{ \llnorm{x^{(0)} - x^*}^2 - \llnorm{x^{(k)} - x^*}^2 } \\
& \leq \frac{1}{2t} \llnorm{x^{(0)} - x^*}^2 \\
\end{split}
\end{equation}
Since \ref{convergence-1} shows that $f(x^{(i)})$ is non-increasing, the last iterate is no worse than the average, which implies
\begin{equation}
f(x^{(k)}) \leq \frac{1}{k} \sum_{i=1}^k f(x^{(i)}) \leq f(x^*) + \frac{\llnorm{x^{(0)} - x^*}^2}{2tk}
\end{equation}
\end{question}
\begin{question}[Convergence Rate for Strong Convexity]
Assume $f$ is differentiable, $L$-smooth, and $m$-strongly convex.
$m$-strong convexity means $f(x) - \frac{m}{2} \llnorm{x}^2$ is convex, i.e., $\nabla^2 f(x) \succeq m \boldsymbol{I}$.
Then there is a constant $0 < \gamma < 1$ such that gradient descent with fixed step size $t \leq \frac{2}{m + L}$ satisfies:
\begin{equation}
f(x^{(k)}) - f(x^*) \leq \gamma^k \frac{L}{2} \llnorm{x^{(0)} - x^*}^2
\end{equation}
Rate is $O(\gamma^k)$. Only $O(\log_{\frac{1}{\gamma}}(\frac{1}{\epsilon}))$ iterations needed.
\end{question}
\begin{question}[Convergence Rate for Non-Convex Case]
$f$ is differentiable and $L$-smooth, but non-convex.
Gradient descent with fixed step size $t \leq \frac{1}{L}$ satisfies:
\begin{equation}
\min_{i=0,\dots,k} \llnorm{\nabla f(x^{(i)})} \leq \sqrt{\frac{2 \left( f(x^{(0)}) - f^* \right)}{t(k+1)}}
\end{equation}
Rate is $O(\frac{1}{\sqrt{k}})$ for finding stationary point.
$O(\frac{1}{\epsilon^2})$ iterations are needed.
\end{question}
\begin{question}[Convergence Rate for Stochastic Gradient Descent]
For convex and $L$-smooth $f$,
\begin{itemize}
\item Gradient Descent
\begin{equation}
\boldsymbol{w^+} = \boldsymbol{w} - t \cdot \frac{1}{n} \sum_{i=1}^n \nabla f_i(\boldsymbol{w})
\end{equation}
\begin{itemize}
\item Step size $t \leq \frac{1}{L}$
\item Time complexity $O(\frac{n}{\epsilon})$
\end{itemize}
\item Stochastic Gradient Descent
\begin{equation}
\boldsymbol{w^+} = \boldsymbol{w} - t \cdot \nabla f_{I_{random}}(\boldsymbol{w})
\end{equation}
\begin{itemize}
\item Step size $t = \frac{1}{k}$ at iteration $k = 1, 2, 3, \dots$ (diminishing step size)
\item Time complexity $O(\frac{1}{\epsilon^2})$
\end{itemize}
\end{itemize}
\end{question}
\section{Fully-Connected Neural Networks}
\begin{question}[Forward and Backward Pass of a 2-Layer MLP]
A 2-layer MLP ($k$ is the NN width, $c$ is the output dim):
\begin{align}
\boldsymbol{x} & = \text{input} & (\boldsymbol{x} \in \mathbb{R}^d) \\
\boldsymbol{z} & = \boldsymbol{W} \boldsymbol{x} + \boldsymbol{b_1} & (\boldsymbol{W} \in \mathbb{R}^{k \times d}, \boldsymbol{z}, \boldsymbol{b_1} \in \mathbb{R}^k) \\
\boldsymbol{h} & = \text{ReLU}(\boldsymbol{z}) & (\boldsymbol{h} \in \mathbb{R}^k) \\
\boldsymbol{\theta} & = \boldsymbol{U} \boldsymbol{h} + \boldsymbol{b_2} & (\boldsymbol{U} \in \mathbb{R}^{c \times k}, \boldsymbol{\theta}, \boldsymbol{b_2} \in \mathbb{R}^c) \\
J & = \frac{1}{2} \llnorm{\boldsymbol{\theta} - \boldsymbol{y}}^2 & (\boldsymbol{y} \in \mathbb{R}^c, J \in \mathbb{R})
\end{align}
\begin{align}
\text{ReLU}(x) = \begin{cases}
x & x > 0 \\
0 & x \leq 0
\end{cases} \\
\text{ReLU}'(x) = \begin{cases}
1 & x > 0 \\
0 & x \leq 0
\end{cases}
\end{align}
Backward pass ($\odot$ is the Hadamard product, i.e. element-wise product):
\begin{align}
\frac{\partial J}{\partial \boldsymbol{\theta}} & = \boldsymbol{\theta} - \boldsymbol{y} \\
\frac{\partial J}{\partial \boldsymbol{U}} & = \frac{\partial J}{\partial \boldsymbol{\theta}} \cdot \frac{\partial \boldsymbol{\theta}}{\partial \boldsymbol{U}} = (\boldsymbol{\theta} - \boldsymbol{y}) \boldsymbol{h}^T \\
\frac{\partial J}{\partial \boldsymbol{b_2}} & = \frac{\partial J}{\partial \boldsymbol{\theta}} \cdot \frac{\partial \boldsymbol{\theta}}{\partial \boldsymbol{b_2}} = \boldsymbol{\theta} - \boldsymbol{y} \\
\frac{\partial J}{\partial \boldsymbol{h}} & = \frac{\partial J}{\partial \boldsymbol{\theta}} \cdot \frac{\partial \boldsymbol{\theta}}{\partial \boldsymbol{h}} = \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \\
\frac{\partial J}{\partial \boldsymbol{z}} & = \frac{\partial J}{\partial \boldsymbol{h}} \cdot \frac{\partial \boldsymbol{h}}{\partial \boldsymbol{z}} = \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \odot \text{ReLU}'(\boldsymbol{z}) \\
\frac{\partial J}{\partial \boldsymbol{W}} & = \frac{\partial J}{\partial \boldsymbol{z}} \cdot \frac{\partial \boldsymbol{z}}{\partial \boldsymbol{W}} = \left[ \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \odot \text{ReLU}'(\boldsymbol{z}) \right] \boldsymbol{x}^T \\
\frac{\partial J}{\partial \boldsymbol{b_1}} & = \frac{\partial J}{\partial \boldsymbol{z}} \cdot \frac{\partial \boldsymbol{z}}{\partial \boldsymbol{b_1}} = \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \odot \text{ReLU}'(\boldsymbol{z})
\end{align}
\end{question}
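A NumPy sketch of one forward and backward pass, implementing exactly the equations above; the input shapes follow the notes:
\begin{lstlisting}[language=python]
import numpy as np

def mlp_forward_backward(x, y, W, b1, U, b2):
    """x: (d,), y: (c,), W: (k, d), b1: (k,), U: (c, k), b2: (c,)."""
    # forward pass
    z = W @ x + b1
    h = np.maximum(z, 0)                # ReLU
    theta = U @ h + b2
    J = 0.5 * np.sum((theta - y) ** 2)
    # backward pass
    dtheta = theta - y                  # dJ/dtheta
    dU = np.outer(dtheta, h)            # dJ/dU = (theta - y) h^T
    db2 = dtheta
    dh = U.T @ dtheta                   # dJ/dh = U^T (theta - y)
    dz = dh * (z > 0)                   # dJ/dz = dJ/dh * ReLU'(z)
    dW = np.outer(dz, x)                # dJ/dW = dz x^T
    db1 = dz
    return J, (dW, db1, dU, db2)
\end{lstlisting}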
\begin{question}[Universal Approximation Theorem by 2-Layer NNs]
For any continuous function $f: \mathbb{R}^d \rightarrow \mathbb{R}^c$,
any compact set $K \subset \mathbb{R}^d$, and any $\epsilon > 0$,
there exists $k \in \mathbb{N}, \boldsymbol{W} \in \mathbb{R}^{k \times d}, \boldsymbol{b} \in \mathbb{R}^k, \boldsymbol{U} \in \mathbb{R}^{c \times k}$
such that
\begin{equation}
\sup_{\boldsymbol{x} \in K} \llnorm{f(\boldsymbol{x}) - g(\boldsymbol{x})} < \epsilon
\end{equation}
where $g(\boldsymbol{x}) = \boldsymbol{U}(\sigma(\boldsymbol{W}\boldsymbol{x} + \boldsymbol{b}))$
and $\sigma$ is the element-wise ReLU operation.
As long as the 2-layer MLP is wide enough, it can approximate any
continuous function arbitrarily closely.
\end{question}
\section{Convolutional Neural Networks}
\begin{question}[Controlling the Convolution]
Hyperparameters:
\begin{itemize}
\item \textbf{Filter (kernel) size}: width $\times$ height.
\item \textbf{Number of filters (kernels)}. \\
Weights are not shared between different filters (kernels).
\item \textbf{Stride}: how many pixels the filter moves each time.
\item \textbf{Padding}: add zeros around the boundary of the input.
\end{itemize}
\end{question}
\begin{question}[Size Calculation]
\begin{align*}
\text{Input size: } & m \times n \times c_{in} \\
\text{Filter size: } & a \times b \times c_{in} \\
\text{Stride: } & s \times t \\
\text{Padding: } & p \times q
\end{align*}
Output size:
\begin{equation}
\floor*{1 + \frac{m + 2p - a}{s}} \times \floor*{1 + \frac{n + 2q - b}{t}}
\end{equation}
\end{question}
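A small Python helper implementing this formula; the assertion is an illustrative example:
\begin{lstlisting}[language=python]
def conv_output_size(m, n, a, b, s, t, p, q):
    """Input m x n, filter a x b, stride s x t, padding p x q.
    Integer floor division matches the floor in the formula."""
    return (1 + (m + 2 * p - a) // s, 1 + (n + 2 * q - b) // t)

# e.g., a 32x32 input with a 5x5 filter, stride 1, no padding -> 28x28
assert conv_output_size(32, 32, 5, 5, 1, 1, 0, 0) == (28, 28)
\end{lstlisting}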
\part{For Final}
\section{Transformer}
\begin{question}[Attention Layer Inputs and Outputs]
Inputs: $V \in \mathbb{R}^{n \times d}$, $K \in \mathbb{R}^{n \times d}$, $Q \in \mathbb{R}^{m \times d}$.
Output: an $m \times d$ matrix.
\begin{itemize}
\item \textbf{Self Attention}: $m = n$,
\item \textbf{Cross Attention}: $m \neq n$ where $m$ is the sequence length of decoder, $n$ is the sequence length of encoder.
\end{itemize}
\end{question}
\begin{question}[Attention Layer Calculation]
\begin{equation}
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d}}\right)V
\end{equation}
Softmax is applied row-wise, i.e., each row of $QK^T$ is normalized to sum to 1.
\end{question}
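A NumPy sketch of this computation; the max-subtraction before exponentiating is a standard numerical-stability trick and does not change the softmax:
\begin{lstlisting}[language=python]
import numpy as np

def attention(Q, K, V):
    """Q: (m, d), K: (n, d), V: (n, d) -> (m, d)."""
    d = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d)                  # (m, n)
    scores -= scores.max(axis=-1, keepdims=True)   # stability; softmax unchanged
    A = np.exp(scores)
    A /= A.sum(axis=-1, keepdims=True)             # row-wise softmax
    return A @ V
\end{lstlisting}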
\begin{question}[Learnable Attention Layer]
\begin{equation}
\text{Attention}(XW^v, XW^k, XW^q) = \text{softmax}\left(\frac{XW^q(XW^k)^T}{\sqrt{d}}\right)XW^v
\end{equation}
\end{question}
\begin{question}[RMSNorm (LLaMA's Choice)]
\begin{equation}
\bar{a_i} = \frac{a_i}{\text{RMS}(a)}\gamma = \frac{a_i}{\sqrt{\frac{1}{d}\sum_{j=1}^d a_j^2}}\gamma
\end{equation}
\end{question}
\begin{question}[Transformer Loss]
\begin{equation}
\min_{W} \hat{\mathbb{E}}\left[ - \iprod{Y, \log \hat{Y}} \right]
\end{equation}
$Y$ is output sequence, one-hot; \\
$\hat{Y}$ is the predicted probabilities
\end{question}
\begin{question}[Transformer Implementation] As follows.
\begin{lstlisting}[language=python]
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RMSNorm(nn.Module):
    def __init__(self, hidden_dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_dim))

    def forward(self, hidden_state):
        norm = hidden_state.pow(2).mean(-1, keepdim=True)
        return hidden_state * self.weight * torch.rsqrt(norm + self.eps)

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.q_linear = nn.Linear(hidden_dim, hidden_dim)
        self.k_linear = nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = nn.Linear(hidden_dim, hidden_dim)
        self.o_linear = nn.Linear(hidden_dim, hidden_dim)
        self.norm = RMSNorm(hidden_dim)

    def forward(self, hidden_state, mask, past_kv=None, use_cache=True):
        bs = hidden_state.shape[0]
        residual = hidden_state
        hidden_state = self.norm(hidden_state)  # LLaMA-style pre-normalization
        q = self.q_linear(hidden_state)  # (bs, seqlen, hidden_dim)
        k = self.k_linear(hidden_state)
        v = self.v_linear(hidden_state)
        # split heads: (bs, num_heads, seqlen, head_dim)
        q = q.view(bs, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(bs, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(bs, -1, self.num_heads, self.head_dim).transpose(1, 2)
        q, k = apply_rope(q, k)  # rotary embedding, assumed defined elsewhere
        # KV cache: prepend cached keys/values along the sequence dimension
        if past_kv is not None:
            past_k, past_v = past_kv
            k = torch.cat([past_k, k], dim=2)
            v = torch.cat([past_v, v], dim=2)
        new_past_kv = (k, v) if use_cache else None
        # scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
        scores = scores + mask * -1e9  # mask out disallowed positions
        attention = F.softmax(scores, dim=-1)
        output = torch.matmul(attention, v)
        # concatenate heads back to (bs, seqlen, hidden_dim)
        output = output.transpose(1, 2).contiguous().view(bs, -1, self.hidden_dim)
        output = self.o_linear(output)
        output = output + residual
        return (output, new_past_kv) if use_cache else output
\end{lstlisting}
\end{question}
\section{Large Language Models}
\begin{question}[BERT vs. GPT]
BERT is an encoder; GPT is a decoder.
\begin{itemize}
\item BERT predicts middle words; GPT predicts the next word.
\item BERT is \textbf{NOT} auto-regressive; GPT is auto-regressive.
\end{itemize}
\end{question}
\begin{question}[GPT -- Generative Pre-Training]
\begin{equation}
\min_{\Theta} \hat{\mathbb{E}} - \log \prod_{j=1}^m \Pr(x_j \vert x_1, \dots, x_{j-1}; \Theta)
\end{equation}
\end{question}
\begin{question}[Fine-Tuning Tasks]
Supervised fine-tuning tasks:
\begin{equation}
\min_{\Theta} \underbrace{- \hat{\mathbb{E}} \log \Pr(y \vert X_{1:m}; \Theta)}_{\text{task-aware supervised loss}} \underbrace{- \lambda \hat{\mathbb{E}} \log \prod_{j=1}^m \Pr(x_j \vert X_{1:j-1}; \Theta)}_{\text{pretraining loss}}
\end{equation}
\end{question}
\begin{question}[BERT $\rightarrow$ RoBERTa]
Trains longer, with bigger batches, over more data and longer sequences.
Removes the next-sentence-prediction objective.
\end{question}
\begin{question}[Sentence-BERT]
A twin (Siamese) network architecture that uses BERT to derive sentence embeddings.
\end{question}
\begin{question}[GPT-2]
1.5B parameters.
\begin{itemize}
\item 10x larger than GPT-1
\item The training method is the same as GPT-1's.
\item Performs on par with BERT on fine-tuning tasks.
\item Good at zero-shot learning.
\item Open-source.
\end{itemize}
\end{question}
\begin{question}[GPT-3]
175B parameters.
\begin{itemize}
\item 100x larger than GPT-2.
\item The training method is the same as GPT-2's.
\item New phenomenon: \textbf{in-context learning} (ICL, or few-shot learning) and \textbf{chain-of-thoughts} (CoT).
\end{itemize}
\end{question}
\begin{question}[GPT-3.5 -- RLHF]
Reinforcement Learning from Human Feedback (RLHF).
\begin{itemize}
\item state = prompt
\item action = model output
\item policy function = LLM
\item reward = levels of matching human feedback
\end{itemize}
Pairwise comparison loss to train the reward model $r_\theta$:
\begin{equation}
\mathcal{L}_{\text{pair}}(\theta) = - \frac{1}{\binom{K}{2}} \mathbb{E}_{(x, y_w, y_l)} \left[ \log ( \sigma(r_\theta(x, y_w) - r_\theta(x, y_l)) ) \right]
\end{equation}
Proximal Policy Optimization (PPO) to maximize objective:
\begin{equation}
\max_{\Phi} \mathbb{E}_{(x, y)} \left[ \underbrace{r_\theta(x, y)}_{\text{maximize reward}} - \beta \underbrace{\log\left( \frac{\pi_\Phi^{\text{RL}}(y \vert x)}{\pi^{\text{SFT}}(y \vert x)}\right) }_{\text{keep close to SFT model}} \right] + \gamma \underbrace{\mathbb{E}_{x \sim \text{pretrain}}\left[\log \pi_\Phi^{\text{RL}}(x)\right]}_{\text{pretraining loss}}
\end{equation}
\end{question}
\section{Speculative Sampling}
\begin{question}[Rejection Sampling Check]
Draft tokens are checked in parallel. Let $q$ be the draft model's distribution, $p$ the target model's distribution, and $t \sim q$ the draft token.
\begin{itemize}
\item $r \sim U(0, 1), \quad \text{if } r < \underbrace{\min\left(1, \frac{p(t)}{q(t)}\right)}_{\text{accept rate}}$, next token $= t$.
\item else: next token $= t' \sim \underbrace{\text{norm}(\max(0, p-q))}_{\text{residual distribution}}$.
\end{itemize}
\end{question}
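A NumPy sketch of one accept/reject step under this scheme; the function name and the \texttt{rng} argument are assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def speculative_step(p, q, rng):
    """q: draft distribution, p: target distribution (1-D arrays summing
    to 1); rng: np.random.Generator."""
    t = rng.choice(len(q), p=q)                # draft token t ~ q
    if rng.random() < min(1.0, p[t] / q[t]):   # accept with rate min(1, p/q)
        return t
    # residual distribution norm(max(0, p - q)); if p == q everywhere,
    # the acceptance rate is 1 and this branch is never reached
    residual = np.maximum(p - q, 0.0)
    return rng.choice(len(p), p=residual / residual.sum())
\end{lstlisting}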
\begin{question}[Proof: Rejection Sampling $\equiv t \sim p$]
\begin{align}
\begin{split}
\min(p(t), q(t)) + \max(0, p(t) - q(t))
&= \begin{cases}
p(t) + 0 & \text{if } p(t) < q(t) \\
q(t) + p(t) - q(t) & \text{if } p(t) \geq q(t)
\end{cases} \\
&= p(t)
\end{split} \\
\Rightarrow & \sum_t (\min(p(t), q(t)) + \max(0, p(t) - q(t))) = \sum_t p(t) = 1 \\
\Rightarrow & 1 - \sum_t \min(p(t), q(t)) = \sum_t \max(0, p(t) - q(t))
\end{align}
\begin{equation}
\begin{split}
\Pr(X = t) &= \Pr(\tilde{X} = t) \Pr(\tilde{X}\text{ accept} \vert \tilde{X} = t) + \Pr(\tilde{X}\text{ reject}) \Pr(\tilde{\tilde{X}} = t \vert \tilde{X}\text{ reject}) \\
&= q(t) \cdot \min\left(1, \frac{p(t)}{q(t)}\right) + (1 - \Pr(\tilde{X}\text{ accept})) \cdot \text{norm}(\max(0, p(t) - q(t))) \\
&= \min\left(q(t), p(t)\right) + \left(1 - \sum_{t'} \min\left(p(t'), q(t')\right)\right) \cdot \frac{\max(0, p(t) - q(t))}{\sum_{t'} \max(0, p(t') - q(t'))} \\
&= \min\left(q(t), p(t)\right) + \max(0, p(t) - q(t)) \\
&= p(t)
\end{split}
\end{equation}
\end{question}
\section{Generative Adversarial Networks}
\begin{question}[Representation through Push-Forward]
Let $r$ be any continuous distribution on $\mathbb{R}^h$.
For any distribution $p$ on $\mathbb{R}^d$,
there exists a push-forward map $G: \mathbb{R}^h \rightarrow \mathbb{R}^d$ such that
\begin{equation}
\boldsymbol{z} \sim r \Longrightarrow G(\boldsymbol{z}) \sim p
\end{equation}
\end{question}
\begin{question}[Discriminator's Goal]
For a fixed generator $G$, minimize a log loss over $D$:
\begin{itemize}
\item If $x$ is real, minimize $-\log D(x)$;
\item If $x$ is fake, minimize $-\log(1 - D(x))$.
\end{itemize}
\begin{equation}
\min_D -\frac{1}{2} \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] - \frac{1}{2} \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right]
\end{equation}
\end{question}
\begin{question}[Generator's Goal]
For a fixed discriminator $D$, maximize the same log-loss objective over $G$:
\begin{equation}
\begingroup\color{red}{\max_G}\endgroup -\frac{1}{2} \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] - \frac{1}{2} \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right]
\end{equation}
\end{question}
\begin{question}[Solver]
\begin{equation}
\min_G \max_D V(G, D) = \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] + \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right]
\end{equation}
Solved by alternating minimization-maximization:
\begin{itemize}
\item G step: Fix $D$ and update $G$ by one gradient descent step
\item D step: Fix $G$ and update $D$ by one gradient ascent step
\item Repeat until the algorithm reaches an approximate equilibrium
\end{itemize}
\end{question}
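A schematic PyTorch sketch of one such alternating round; the networks \texttt{G} and \texttt{D} (with \texttt{D} outputting probabilities in $(0,1)$), their optimizers, the real batch \texttt{x}, and the latent width \texttt{h\_dim} are all assumed to exist. This is an illustration, not a full training loop:
\begin{lstlisting}[language=python]
import torch

def gan_round(G, D, opt_g, opt_d, x, h_dim):
    z = torch.randn(x.shape[0], h_dim)
    # D step: ascend on log D(x) + log(1 - D(G(z))), i.e. descend its negation;
    # detach G(z) so the generator is not updated here
    d_loss = -(torch.log(D(x)) + torch.log(1 - D(G(z).detach()))).mean()
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    # G step: descend on log(1 - D(G(z)))
    g_loss = torch.log(1 - D(G(z))).mean()
    opt_g.zero_grad(); g_loss.backward(); opt_g.step()
    return d_loss.item(), g_loss.item()
\end{lstlisting}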
\begin{question}[Solution of $D^*$]
Let $p_g(x)$ be the density of $x$ estimated by the generator $G$.
For $G$ fixed, the optimal discriminator $D$ is $D_G^*(x) = \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}$
\textbf{Proof:}
\begin{equation}
\begin{split}
V(G, D) &\coloneq \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] + \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right] \\
&= \int \log D(x) p_{\text{data}}(x) dx + \int \log(1 - D(G(z))) r(z) dz \\
&= \int \underbrace{\log D(x) p_{\text{data}}(x) + p_g(x) \log(1 - D(x))}_{f(D(x))} dx
\end{split}
\end{equation}
For any fixed $x$, taking derivative $= 0$:
\begin{equation}
\begin{split}
f'(D(x)) &= \frac{p_{\text{data}}(x)}{D(x)} - \frac{p_g(x)}{1 - D(x)} = 0 \\
D_G^*(x) &= \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}
\end{split}
\end{equation}
\end{question}
\begin{question}[Solution of $G^*$]
The global minimum of $\min_G \max_D V(G, D)$ is achieved if
and only if $p_g = p_{\text{data}}$. The optimal objective value is $-\log 4$.
\textbf{Proof:}
\begin{equation}
\begin{split}
V(G, D_G^*) &= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D_G^*(x)\right] + \mathbb{E}_{z \sim r} \left[\log(1 - D_G^*(G(z)))\right] \\
&= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D_G^*(x)\right] + \mathbb{E}_{x \sim p_g} \left[\log(1 - D_G^*(x))\right] \\
&= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}\right] + \mathbb{E}_{x \sim p_g} \left[\log \frac{p_g(x)}{p_{\text{data}}(x) + p_g(x)}\right] \\
\end{split}
\end{equation}
By definition of KL divergence $\text{KL}(P \| Q) = \mathbb{E}_{x \sim P} \left[\log \frac{p(x)}{q(x)}\right]$, we have:
\begin{equation}
\begin{split}
V(G, D_G^*) &= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}\right] + \mathbb{E}_{x \sim p_g} \left[\log \frac{p_g(x)}{p_{\text{data}}(x) + p_g(x)}\right] \\
&= -\log 4 + \text{KL}\left(p_{\text{data}} \| \frac{p_\text{data} + p_g}{2}\right) + \text{KL}\left(p_g \| \frac{p_{\text{data}} + p_g}{2}\right) \\
&= -\log 4 + 2 \cdot \text{JSD}(p_{\text{data}} \| p_g) \\
& \geq -\log 4
\end{split}
\end{equation}
The equality holds if and only if $p_{\text{data}} = p_g$.
\end{question}
\section{Adversarial Attacks}
\begin{question}[Principle of Generating Adversarial Attacks]
\begin{equation}
\max_{\left\|x_{\text{adv}}-x\right\|_{\infty} \leq \epsilon} \mathcal{L}(C(x_{\text{adv}}), y)
\end{equation}
where $C$ is the end-to-end classifier, i.e., the composition of the network $f$ and the prediction rule $h$.
\end{question}
\begin{question}[Different Solvers] Solvers for the attack optimization problem:
\begin{itemize}
\item \textbf{Zero-Order Solvers} (only access to the output of NN)
\begin{itemize}
\item Black-box attack
\end{itemize}
\item \textbf{First-Order Solvers} (access to the gradient of NN)
\begin{itemize}
\item White-box attack
\item Fast Gradient Sign Method (FGSM), BIM, PGD, CW attack, \dots
\end{itemize}
\item \textbf{Second-Order Solvers} (access to the Hessian matrix)
\begin{itemize}
\item White-box attack
\item L-BFGS attack
\end{itemize}
\end{itemize}
\end{question}
\begin{question}[H\"older Inequality]
For any $p, q \geq 1$ such that $\frac{1}{p} + \frac{1}{q} = 1$,
\begin{equation} \label{eq:holder-inequality}
\left\|x\right\|_p \cdot \left\|y\right\|_q \geq \left\langle x, y \right\rangle
\end{equation}
where $\left\langle x, y \right\rangle$ is the inner product.
$\| \cdot \|_p$ and $\| \cdot \|_q$ are also known as dual norms.
\begin{itemize}
\item $\| \cdot \|_2$ is self-dual.
\item $\| \cdot \|_\infty$ and $\| \cdot \|_1$ are dual norms.
\end{itemize}
\end{question}
\begin{question}[FGSM -- Fast Gradient Sign Method]
White-box and non-targeted (maximize the loss w.r.t. the true label).
Do linear expansion at $x$:
\begin{equation}
\mathcal{L}(C(x + \delta), y) \approx \underbrace{\mathcal{L}(C(x), y)}_{\text{constant}} + \nabla_x \mathcal{L}(C(x), y) \cdot \delta
\end{equation}
The problem reduces to:
\begin{equation}
\max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \nabla_x \mathcal{L}(C(x), y) \cdot \delta
\end{equation}
By H\"older's inequality \eqref{eq:holder-inequality}, we have:
\begin{equation}
\nabla_x \mathcal{L}(C(x), y) \cdot \delta \leq \left\|\delta\right\|_{\infty} \cdot \left\|\nabla_x \mathcal{L}(C(x), y)\right\|_1
\leq \epsilon \cdot \left\|\nabla_x \mathcal{L}(C(x), y)\right\|_1
\end{equation}
The upper bound is attained at $\delta = \epsilon \cdot \text{sign}(\nabla_x \mathcal{L}(C(x), y))$. Thus, the adversarial example is generated by:
\begin{equation}
x_{\text{adv}} = x + \epsilon \cdot \text{sign}(\nabla_x \mathcal{L}(C(x), y))
\end{equation}
where $\epsilon$ is the perturbation size.
\end{question}
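A PyTorch sketch of this attack; \texttt{model} and \texttt{loss\_fn} are assumed to be standard PyTorch callables:
\begin{lstlisting}[language=python]
import torch

def fgsm(model, loss_fn, x, y, eps):
    """x_adv = x + eps * sign(grad_x L(C(x), y))."""
    x = x.clone().detach().requires_grad_(True)
    loss_fn(model(x), y).backward()
    return (x + eps * x.grad.sign()).detach()
\end{lstlisting}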
\begin{question}[BIM -- Basic Iterative Method]
BIM is an iterative version of FGSM.
\begin{itemize}
\item Initialize $x^{(0)} = x$.
\item For $k = 1, 2, \dots, K$:
\begin{equation}
x^{(k)} = x^{(k-1)} + \gamma \cdot \text{sign}(\nabla_x \mathcal{L}(C(x^{(k-1)}), y))
\end{equation}
\end{itemize}
Issues:
\begin{itemize}
\item With repeated steps, the accumulated perturbation can grow beyond the budget $\epsilon$.
\item For a pre-defined $\epsilon$, $x^{(k)}$ may not satisfy $\|x^{(k)} - x\|_\infty \leq \epsilon$.
\end{itemize}
\end{question}
\begin{question}[PGD -- Projected Gradient Descent]
To resolve the issue of BIM, PGD involves a truncation operation:
\begin{itemize}
\item Initialize $x^{(0)} = x + \delta$, where $\delta$ is drawn uniformly from $[-\epsilon, \epsilon]^d$.
\item For $k = 1, 2, \dots, K$:
\begin{equation}
x^{(k)} = \text{clip}_{(-\epsilon, \epsilon)}(x^{(k-1)} + \gamma \cdot \text{sign}(\nabla_x \mathcal{L}(C(x^{(k-1)}), y)))
\end{equation}
\end{itemize}
where $\text{clip}_{(-\epsilon, \epsilon)}$ projects its argument back onto the $\ell_\infty$ ball of radius $\epsilon$ around the original input $x$.
\end{question}
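A PyTorch sketch following the same conventions as the FGSM sketch above:
\begin{lstlisting}[language=python]
import torch

def pgd(model, loss_fn, x, y, eps, gamma, K):
    """K signed-gradient steps, each clipped back to the l-infinity ball
    of radius eps around the original x."""
    delta = torch.empty_like(x).uniform_(-eps, eps)  # random start
    for _ in range(K):
        xk = (x + delta).clone().detach().requires_grad_(True)
        loss_fn(model(xk), y).backward()
        delta = (delta + gamma * xk.grad.sign()).clamp(-eps, eps)  # project
    return (x + delta).detach()
\end{lstlisting}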
\begin{question}[Targeted PGD Attack]
Objective:
\begin{itemize}
\item \textbf{Untargeted}: \begin{equation}
\max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \mathcal{L}(C(x + \delta), y_{\text{true}})
\end{equation}
\item \textbf{Targeted}: \begin{equation}
\begingroup\color{red}\min_{\left\|\delta\right\|_{\infty} \leq \epsilon}\endgroup \mathcal{L}(C(x + \delta), \begingroup\color{red}y_{\text{target}}\endgroup)
\end{equation}
\end{itemize}
\end{question}
\section{Adversarial Robustness}
\begin{question}[Defense Mechanisms]
Categorized into two types:
\begin{itemize}
\item \textbf{Gradient Masking}: hide the gradients and make first-order attacks fail
\begin{itemize}
\item \textbf{Shattered Gradients}
By applying a non-smooth or non-differentiable preprocessor $g$ to the inputs,
and then training a DNN model $f$ on the preprocessed inputs $g(x)$.
\item \textbf{Stochastic/Randomized Gradients}
Apply some form of randomization of the DNN model.
E.g. train a set of classifiers and during the testing phase randomly select one classifier to predict the labels.
\end{itemize}
\item \textbf{Adversarial Training}:
\begin{equation}
\min_{C} \mathbb{E}_{(x, y) \sim \mathcal{D}^n} \left[ \max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \mathcal{L}(C(x + \delta), y) \right]
\end{equation}
\end{itemize}
\end{question}
\begin{question}[Trade-Off between Natural and Robust Error]
\begin{equation}
\min_f R_{\text{nat}}(f) + R_{\text{rob}}(f) / \lambda
\end{equation}
\begin{equation}
\begin{split}
R_{\text{nat}}(f) &\coloneq \Pr_{x,y \sim \mathcal{D}} \{f(x) y \leq 0\} \\
&= \mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \mathbb{I}(f(x) y \leq 0) \right] \\
\end{split}
\end{equation}
\begin{equation}
\begin{split}
R_{\text{rob}}(f) &\coloneq \Pr_{x,y \sim \mathcal{D}} \{\exists \delta \text{ with } \left\|\delta\right\|_\infty \leq \epsilon \text{ s.t. } f(x + \delta) y \leq 0\} \\
&= \mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \mathbb{I}(f(x + \delta) y \leq 0) \right] \\
\end{split}
\end{equation}
Approximate by a differentiable surrogate loss $\Phi$:
\begin{equation}
R_{\text{nat}}(f) \approx \mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \Phi(f(x) y) \right]
\end{equation}
\end{question}
\begin{question}[TRADES]
\begin{equation}
\min_f \underbrace{\left[ \underbrace{\mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \Phi(f(x) y) \right]}_{\text{minimize diff btw } f(x) \text{ and } y \text{ for accuracy}} + \underbrace{\mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \Phi(f(x + \delta) f(x)) \right]}_{\text{minimize diff btw } f(x) \text{ and } f(x+\delta) \text{ for robustness}} / \lambda \right]}_{\text{TRADES Loss}}
\end{equation}
\end{question}
\section{Self-Supervised Learning}
\begin{question}[Contrastive Learning]
Maximize the softmax probability of the positive pair among all candidates:
\begin{equation}
\max_\Theta \Pr\nolimits_1 = \frac{\exp(z_1)}{\sum_j \exp(z_j)}
\end{equation}
where $z_1$ is the similarity score of the positive pair and the remaining $z_j$ are scores of negative pairs.
\end{question}
\end{document}