\documentclass{article}
\usepackage{fullpage}
\usepackage{float}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{url, hyperref}
\usepackage{algorithm2e}
\usepackage[margin=0.25in]{geometry}
\usepackage{pgfplots}
\pgfplotsset{width=10cm,compat=1.18}
\usepgfplotslibrary{external}
\tikzexternalize
\lstset{basicstyle=\fontfamily{pcr}\footnotesize}
\graphicspath{{./}}
\newcommand{\paren}[1]{\left( #1 \right)}
\newcommand{\iprod}[1]{\left\langle #1 \right\rangle}
\makeatletter
\newcommand*\MY@rightharpoonupfill@{%
\arrowfill@\relbar\relbar\rightharpoonup
}
\newcommand*\overrightharpoon{%
\mathpalette{\overarrow@\MY@rightharpoonupfill@}%
}
\makeatother
\theoremstyle{definition}
\newtheorem{question}{Question}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Var}{Var}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\newcommand{\llnorm}[1]{\left\| #1 \right\|_2}
\newcommand{\mnorm}[1]{\left\| #1 \right\|_1}
\newcommand{\fnorm}[1]{\left\| #1 \right\|_F}
\title{\large CS480/680, Spring 2025\\\huge Review Notes}
\author{Student: Hongxu Xu (h445xu@uwaterloo.ca)}
\date{\today}
\setlength\parindent{0pt}
%\input{preamble}
\begin{document}
\maketitle
\newpage
\part{For Mid-Term}
\section{Perceptron}
\begin{question}[Linear Function]
\begin{equation}
\begin{split}
& \forall \alpha, \beta \in \mathbb{R}, \forall \boldsymbol{x}, \boldsymbol{z} \in \mathbb{R}^d,
f(\alpha \boldsymbol{x} + \beta \boldsymbol{z}) = \alpha \cdot f(\boldsymbol{x}) + \beta \cdot f(\boldsymbol{z}) \\
\Longleftrightarrow
& \exists \boldsymbol{w} \in \mathbb{R}^d, f(\boldsymbol{x}) = \iprod{\boldsymbol{x}, \boldsymbol{w}}
\end{split}
\end{equation}
\textbf{Proof $\Rightarrow$:}
Let $\boldsymbol{w} \coloneq \left[ f(\boldsymbol{e_1}), \dots, f(\boldsymbol{e_d}) \right]^T$, where
$\boldsymbol{e_i}$ is the $i$-th coordinate vector.
\begin{equation}
\begin{split}
f(\boldsymbol{x}) & = f( x_1 \boldsymbol{e_1} + \dots + x_d \boldsymbol{e_d} ) \\
& = x_1 f(\boldsymbol{e_1}) + \dots + x_d f(\boldsymbol{e_d}) \\
& = \iprod{\boldsymbol{x}, \boldsymbol{w}}
\end{split}
\end{equation}
\textbf{Proof $\Leftarrow$:}
\begin{equation}
\begin{split}
f(\alpha \boldsymbol{x} + \beta \boldsymbol{z}) & = \iprod{\alpha \boldsymbol{x} + \beta \boldsymbol{z}, \boldsymbol{w}} \\
& = \iprod{\alpha \boldsymbol{x}, \boldsymbol{w}} + \iprod{\beta \boldsymbol{z}, \boldsymbol{w}} \\
& = \alpha \iprod{\boldsymbol{x}, \boldsymbol{w}} + \beta \iprod{\boldsymbol{z}, \boldsymbol{w}} \\
& = \alpha f(\boldsymbol{x}) + \beta f(\boldsymbol{z})
\end{split}
\end{equation}
\end{question}
\begin{question}[$\boldsymbol{w}$ is Orthogonal to Decision Boundary $H$]
Any vector lying in $H$ can be written as $\overrightharpoon{\boldsymbol{x}\boldsymbol{x'}} = \boldsymbol{x'} - \boldsymbol{x}$ for some $\boldsymbol{x}, \boldsymbol{x'} \in H$,
\begin{equation}
\iprod{\boldsymbol{x'} - \boldsymbol{x}, \boldsymbol{w}}
= \iprod{\boldsymbol{x'}, \boldsymbol{w}} - \iprod{\boldsymbol{x}, \boldsymbol{w}}
= -b - (-b) = 0
\end{equation}
\end{question}
\begin{question}[Update Rule for Perceptron]
\begin{equation} \label{perceptron-update-rule}
\begin{split}
\boldsymbol{w} \leftarrow \boldsymbol{w} + y_i \boldsymbol{x_i} \\
b \leftarrow b + y_i
\end{split}
\end{equation}
\end{question}
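A minimal NumPy sketch of this update rule; the data layout and the stopping criterion are illustrative assumptions, not from the lecture:
\begin{lstlisting}[language=python]
import numpy as np

def perceptron(X, y, max_epochs=100):
    """X: (n, d) data; y: (n,) labels in {-1, +1}."""
    n, d = X.shape
    w, b = np.zeros(d), 0.0
    for _ in range(max_epochs):
        mistakes = 0
        for i in range(n):
            if y[i] * (X[i] @ w + b) <= 0:   # mistake on x_i
                w += y[i] * X[i]             # w <- w + y_i x_i
                b += y[i]                    # b <- b + y_i
                mistakes += 1
        if mistakes == 0:                    # converged: all points correct
            break
    return w, b
\end{lstlisting}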
\begin{question}[Feasibility of Perceptron]
The goal is to find $\boldsymbol{w} \in \mathbb{R}^d, b \in \mathbb{R}$,
such that $\forall i, y_i \paren{\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b} > 0$.
According to the update rule \ref{perceptron-update-rule},
\begin{equation}
\begin{split}
y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{new}}} + b_{new} \right]
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}} + y \boldsymbol{x}} + b_{old} + y \right] \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + y\left[ \iprod{\boldsymbol{x} ,y \boldsymbol{x}} + y \right] \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + y\left[ y \llnorm{\boldsymbol{x}}^2 + y \right] \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + y^2 \llnorm{\boldsymbol{x}}^2 + y^2 \\
&= y\left[ \iprod{\boldsymbol{x}, \boldsymbol{w_{old}}} + b_{old} \right] + \underbrace{\llnorm{\boldsymbol{x}}^2 + 1}_{\text{always positive}}
\end{split}
\end{equation}
Notice that $y \in \left\{ \pm 1 \right\} \Rightarrow y^2 = 1$.
$\llnorm{\boldsymbol{x}}^2 + 1$ is always positive, which means we always increase the confidence $y \hat{y}$ after the update.
\end{question}
\begin{question}[Trick for Hiding the Bias Term -- Padding]
\begin{equation}
\begin{split}
\iprod{\boldsymbol{x}, \boldsymbol{w}} + b
& = \iprod{ \begin{pmatrix} \boldsymbol{x} \\ 1 \end{pmatrix}, \begin{pmatrix} \boldsymbol{w} \\ b \end{pmatrix} } \\
& = \iprod{ \boldsymbol{x_{pad}}, \boldsymbol{w_{pad}}}
\end{split}
\end{equation}
Correspondingly, the update rule can be written as:
\begin{equation}
\boldsymbol{w_{pad}} \leftarrow \boldsymbol{w_{pad}} + y \boldsymbol{x_{pad}}
\end{equation}
\end{question}
\begin{question}[Margin]
Suppose $\exists \boldsymbol{w^*}$ such that $\forall i, y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w^*}} > 0$.
We normalize $\boldsymbol{w^*}$ such that $\llnorm{\boldsymbol{w^*}} = 1$.
In other words, $\boldsymbol{w^*}$ is the normalized weight for the decision boundary.
\begin{equation} \label{margin}
\text{Margin } \gamma \coloneq \min_i \left| \iprod{\boldsymbol{x_i}, \boldsymbol{w^*}} \right|
\end{equation}
\end{question}
\begin{question}[Convergence Theorem -- Linearly Separable Case]
Assume that $\forall i, \llnorm{\boldsymbol{x_i}} \leq C$ (i.e., all data lie within a ball of radius $C$).
Then the Perceptron algorithm converges after at most $\frac{C^2}{\gamma^2}$ mistakes.
\vspace{5mm}
\textbf{Proof:}
Suppose $\boldsymbol{w}$ is the updating weight, and $\theta$ is the angle between $\boldsymbol{w}$ and $\boldsymbol{w^*}$.
We have $\iprod{\boldsymbol{w}, \boldsymbol{w^*}} = \llnorm{\boldsymbol{w}} \llnorm{\boldsymbol{w^*}} \cos{\theta} = \llnorm{\boldsymbol{w}} \cos{\theta}$.
After an update, $\llnorm{\boldsymbol{w_{new}}} \cos{\theta_{new}}$ will be
\begin{equation}
\begin{split}
\iprod{\boldsymbol{w} + y \boldsymbol{x}, \boldsymbol{w^*}}
& = \iprod{\boldsymbol{w}, \boldsymbol{w^*}} + y \iprod{\boldsymbol{x}, \boldsymbol{w^*}} \\
& = \iprod{\boldsymbol{w}, \boldsymbol{w^*}} + \left|\iprod{\boldsymbol{x}, \boldsymbol{w^*}}\right| \\
& \geq \iprod{\boldsymbol{w}, \boldsymbol{w^*}} + \gamma
\end{split}
\end{equation}
Let's see the change of $\iprod{\boldsymbol{w_{new}}, \boldsymbol{w_{new}}} = \llnorm{\boldsymbol{w_{new}}}^2$,
\begin{equation}
\begin{split}
\iprod{\boldsymbol{w} + y \boldsymbol{x}, \boldsymbol{w} + y \boldsymbol{x}}
& = \iprod{\boldsymbol{w}, \boldsymbol{w}} + 2y \iprod{\boldsymbol{w}, \boldsymbol{x}} + y^2 \iprod{\boldsymbol{x}, \boldsymbol{x}} \\
& = \llnorm{\boldsymbol{w}}^2 + 2y \iprod{\boldsymbol{w}, \boldsymbol{x}} + \llnorm{\boldsymbol{x}}^2
\end{split}
\end{equation}
Because the update only happens on a mistake, $y\iprod{\boldsymbol{w}, \boldsymbol{x}} \leq 0$; together with $\llnorm{\boldsymbol{x}} \leq C$,
\begin{equation}
\begin{split}
\iprod{\boldsymbol{w} + y \boldsymbol{x}, \boldsymbol{w} + y \boldsymbol{x}}
& = \llnorm{\boldsymbol{w}}^2 + 2y \iprod{\boldsymbol{w}, \boldsymbol{x}} + \llnorm{\boldsymbol{x}}^2 \\
& \leq \llnorm{\boldsymbol{w}}^2 + C^2
\end{split}
\end{equation}
Finally, suppose the algorithm has made $M$ updates; then $\iprod{\boldsymbol{w}, \boldsymbol{w^*}} \geq M \gamma$ and $\llnorm{\boldsymbol{w}}^2 \leq M C^2$, so
\begin{equation}
\begin{split}
1 \geq \cos{\theta} & = \frac{\iprod{\boldsymbol{w}, \boldsymbol{w^*}}}{\llnorm{\boldsymbol{w}} \llnorm{\boldsymbol{w^*}}} \\
& \geq \frac{M \gamma}{\sqrt{M C^2} \times 1} \\
& = \sqrt{M} \frac{\gamma}{C}
\end{split}
\end{equation}
which means $M \leq \frac{C^2}{\gamma^2}$.
\end{question}
\begin{question}[Perceptron Loss]
\begin{equation}
\begin{split}
l(\boldsymbol{w}, \boldsymbol{x_t}, y_t)
& = -y_t \iprod{\boldsymbol{w}, \boldsymbol{x_t}} \mathbb{I}[\text{mistake on }\boldsymbol{x_t}] \\
& = -\min \left\{ y_t \iprod{\boldsymbol{w}, \boldsymbol{x_t}}, 0 \right\}
\end{split}
\end{equation}
\begin{equation}
L(\boldsymbol{w}) = -\frac{1}{n} \sum_{t=1}^n y_t \iprod{\boldsymbol{w}, \boldsymbol{x_t}} \mathbb{I} [\text{mistake on }\boldsymbol{x_t}]
\end{equation}
\end{question}
\section{Linear Regression}
\begin{question}[Least Square Regression]
\begin{equation}
\min_{f: \mathcal{X} \rightarrow \mathcal{Y}} \mathbb{E} \llnorm{f(\boldsymbol{X}) - Y}^2
\end{equation}
The optimal regression function is
\begin{equation}
f^*(\boldsymbol{x}) = m(\boldsymbol{x}) = \mathbb{E} [Y | \boldsymbol{X} = \boldsymbol{x}]
\end{equation}
Computing it requires knowing the joint distribution of $(\boldsymbol{X}, Y)$.
\end{question}
\begin{question}[Bias-Variance Decomposition]
\begin{equation}
\begin{split}
\mathbb{E} \llnorm{f(\boldsymbol{X}) - Y}^2
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X}) + m(\boldsymbol{X}) - Y}^2 \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - Y} \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \; \mathbb{E}_{Y|\boldsymbol{X}} \left[ \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - Y} \right] \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - \mathbb{E}_{Y|\boldsymbol{X}} [Y]} \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2 + 2 \mathbb{E} \iprod{f(\boldsymbol{X}) - m(\boldsymbol{X}), m(\boldsymbol{X}) - m(\boldsymbol{X})} \\
& = \mathbb{E} \llnorm{f(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underbrace{\mathbb{E} \llnorm{m(\boldsymbol{X}) - Y}^2}_{\text{noise (variance)}}
\end{split}
\end{equation}
The last term is the noise (variance), which does not depend on $f$.
So, to minimize the squared error, we need $f \approx m$.
However, $m(\boldsymbol{x}) = \mathbb{E} [Y | \boldsymbol{X} = \boldsymbol{x}]$ cannot be computed, because the conditional distribution is unknown.
Instead, we learn $f_D$ from the training data $D$. Define $\bar{f}(\boldsymbol{X}) \coloneq \mathbb{E}_D[f_D(\boldsymbol{X})]$.
\begin{equation}
\begin{split}
\underbrace{\mathbb{E}_{\boldsymbol{X}, Y, D} \llnorm{f_D(\boldsymbol{X}) - Y}^2}_{\text{test error}}
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}) + \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X})}^2 + \mathbb{E}_{\boldsymbol{X}} \llnorm{\bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})}^2 \\
& \qquad + 2 \mathbb{E}_{\boldsymbol{X}, D} \iprod{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} \\
& \qquad + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \dots + 2 \mathbb{E}_{\boldsymbol{X}} \mathbb{E}_{D} \iprod{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} + \underline{\dots} \\
& = \dots + 2 \mathbb{E}_{\boldsymbol{X}} \iprod{\mathbb{E}_{D} [f_D(\boldsymbol{X})] - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} + \underline{\dots} \\
& = \dots + 2 \mathbb{E}_{\boldsymbol{X}} \iprod{\bar{f}(\boldsymbol{X}) - \bar{f}(\boldsymbol{X}), \bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})} + \underline{\dots} \\
& = \dots + 0 + \underline{\dots} \\
& = \mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \bar{f}(\boldsymbol{X})}^2 + \mathbb{E}_{\boldsymbol{X}} \llnorm{\bar{f}(\boldsymbol{X}) - m(\boldsymbol{X})}^2 + \underline{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2} \\
& = \underbrace{\mathbb{E}_{\boldsymbol{X}, D} \llnorm{f_D(\boldsymbol{X}) - \mathbb{E}_D [f_D(\boldsymbol{X})]}^2}_{\text{variance}} + \underbrace{\mathbb{E}_{\boldsymbol{X}} \llnorm{\mathbb{E}_D [f_D(\boldsymbol{X})] - m(\boldsymbol{X})}^2}_{\text{bias}^2} + \underbrace{\mathbb{E}_{\boldsymbol{X}, Y} \llnorm{m(\boldsymbol{X}) - Y}^2}_{\text{noise (variance)}}
\end{split}
\end{equation}
\end{question}
\begin{question}[Sampling $\rightarrow$ Training]
Replace the expectation with a sample average over $\paren{\boldsymbol{X_i}, Y_i} \sim P$.
\begin{equation}
\min_{f: \mathcal{X} \rightarrow \mathcal{Y}} \hat{\mathbb{E}} \llnorm{f(\boldsymbol{X}) - Y}^2 \coloneq \frac{1}{n} \sum_{i=1}^n \llnorm{f(\boldsymbol{X_i}) - Y_i}^2
\end{equation}
Uniform law of large numbers: as the training data size $n \rightarrow \infty$,
$\hat{\mathbb{E}} \rightarrow \mathbb{E}$ and (hopefully) $\argmin \hat{\mathbb{E}} \rightarrow \argmin \mathbb{E}$.
\end{question}
\begin{question}[Linear Regression]
Padding: $\boldsymbol{x} \leftarrow \begin{pmatrix}\boldsymbol{x} \\ 1\end{pmatrix}$, $W \leftarrow \begin{pmatrix}W \\ \boldsymbol{b}\end{pmatrix}$
\begin{align*}
X & = [\boldsymbol{x_1}, \dots, \boldsymbol{x_n}] \in \mathbb{R}^{(d+1) \times n}, \\
Y & = [\boldsymbol{y_1}, \dots, \boldsymbol{y_n}] \in \mathbb{R}^{t \times n}, \\
W & \in \mathbb{R}^{t \times (d+1)}, \\
\fnorm{A} & = \sqrt{\sum_{ij} a_{ij}^2}
\end{align*}
Linear regression is:
\begin{equation}
\min_{W \in \mathbb{R}^{t \times (d+1)}} \frac{1}{n} \fnorm{WX - Y}^2
\end{equation}
\end{question}
\begin{question}[Optimality Condition]
If $\boldsymbol{w}$ is a minimizer (or maximizer) of a differentiable function $f$
\textbf{over an open set}, then $f'(\boldsymbol{w}) = 0$.
\end{question}
\begin{question}[Solving Linear Regression]
\begin{equation}
L(W) = \frac{1}{n} \fnorm{WX-Y}^2
\end{equation}
\begin{equation}
\begin{split}
\nabla_W L(W) & = \frac{2}{n} (WX-Y) X^T = 0 \\
& \Rightarrow W X X^T = Y X^T \\
& \Rightarrow W = Y X^T (X X^T)^{-1} \\
\end{split}
\end{equation}
\end{question}
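A NumPy sketch of this closed form, using a linear solve instead of an explicit inverse; the column-wise data layout follows the notes, and the helper name is an assumption:
\begin{lstlisting}[language=python]
import numpy as np

def solve_linear_regression(X, Y):
    """X: (d+1, n) padded inputs; Y: (t, n) targets.
    Returns W = Y X^T (X X^T)^{-1}, shape (t, d+1)."""
    # solve (X X^T) Z = X Y^T, then W = Z^T; avoids forming the inverse
    return np.linalg.solve(X @ X.T, X @ Y.T).T
\end{lstlisting}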
\begin{question}[Ill-Conditioning]
A slight perturbation of the data can lead to drastically different solutions, which happens whenever
$X$ is ill-conditioned, i.e., (close to) rank-deficient.
\vspace{5mm}
A rank-deficient $X$ means:
\begin{enumerate}
\item two columns in $X$ are linearly dependent (or simply the same)
\item but the corresponding $y$ might be different
\end{enumerate}
\end{question}
\begin{question}[Ridge Regression]
\begin{equation}
\min_{W} \frac{1}{n} \fnorm{WX - Y}^2 + \lambda \fnorm{W}^2
\end{equation}
\begin{equation}
\begin{split}
\nabla_W L(W) & = \frac{2}{n} (WX - Y) X^T + 2 \lambda W = 0 \\
& \Rightarrow W X X^T - Y X^T + n \lambda W = 0 \\
& \Rightarrow W (X X^T + n \lambda I) = Y X^T
\end{split}
\end{equation}
\begin{equation}
\begin{split}
& X = U \Sigma V^T \\
\Rightarrow & X X^T = U \Sigma (V^T V) \Sigma U^T = U \Sigma^2 U^T \\
\Rightarrow & X X^T + n \lambda I = U \underbrace{(\Sigma^2 + n \lambda I)}_{\text{strictly positive}} U^T \\
\Rightarrow & X X^T + n \lambda I \text{ is of full-rank}
\end{split}
\end{equation}
$\lambda$ is the regularization parameter; as $\lambda \rightarrow \infty$, $W \rightarrow \boldsymbol{0}$.
\end{question}
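The same sketch extended with the ridge term; the helper name and the strictly positive \texttt{lam} are assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def solve_ridge(X, Y, lam):
    """X: (d+1, n); Y: (t, n); lam > 0.
    Returns W satisfying W (X X^T + n*lam*I) = Y X^T."""
    d1, n = X.shape
    A = X @ X.T + n * lam * np.eye(d1)  # full-rank for any lam > 0
    return np.linalg.solve(A, X @ Y.T).T
\end{lstlisting}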
\begin{question}[Regularization $\equiv$ Data Augmentation]
\begin{equation}
\frac{1}{n} \fnorm{WX - Y}^2 + \lambda \fnorm{W}^2
= \frac{1}{n} \fnorm{W \left[X \quad \sqrt{n\lambda} I \right] - \left[Y \quad \boldsymbol{0}\right]}^2
\end{equation}
\end{question}
\section{Logistic Regression}
\begin{question}[Max Likelihood Estimation]
Let $\mathcal{Y} = \{0, 1\}$.
Learn confidence $p(\boldsymbol{x}; \boldsymbol{w}) \coloneq \Pr (Y=1|X=\boldsymbol{x})$.
\begin{equation}
\begin{split}
\max_{\boldsymbol{w}} \Pr (Y_1 = y_1, \dots, Y_n = y_n)
& = \max_{\boldsymbol{w}} \prod_{i=1}^n \Pr (Y_i = y_i | X_i = x_i) \\
& \stackrel{\mathcal{Y}=\{0, 1\}}{=} \max_{\boldsymbol{w}} \prod_{i=1}^n \left[p(\boldsymbol{x_i}; \boldsymbol{w})\right]^{y_i} \left[1 - p(\boldsymbol{x_i}; \boldsymbol{w})\right]^{1 - y_i} \\
\end{split}
\end{equation}
Use negative log-likelihood:
\begin{equation}
\min_{\boldsymbol{w}} \sum_{i=1}^n \left[ -y_i \log p(\boldsymbol{x_i}; \boldsymbol{w}) - (1 - y_i) \log (1 - p(\boldsymbol{x_i}; \boldsymbol{w})) \right]
\end{equation}
\end{question}
\begin{question}[Odds Ratio and Sigmoid]
\begin{equation}
\text{Odds} = \frac{p}{1 - p}, \qquad p = \Pr(Y = 1 | X = \boldsymbol{x})
\end{equation}
Assume $\log \frac{p(\boldsymbol{x};\boldsymbol{w})}{1 - p(\boldsymbol{x};\boldsymbol{w})} = \iprod{\boldsymbol{x}, \boldsymbol{w}}$.
\vspace{5mm}
The Sigmoid transformation is:
\begin{equation}
p(\boldsymbol{x}; \boldsymbol{w}) = \frac{1}{1 + \exp (-\iprod{\boldsymbol{x}, \boldsymbol{w}})}
\end{equation}
\end{question}
\begin{question}[Logistic Regression]
Plug the sigmoid in the negative log-likelihood:
\begin{equation}
\begin{split}
& \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ -y_i \log p(\boldsymbol{x_i}; \boldsymbol{w}) - (1 - y_i) \log (1 - p(\boldsymbol{x_i}; \boldsymbol{w})) \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ y_i \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] - (1 - y_i) \log \left(1 - \frac{1}{1 + \exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})}\right) \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ y_i \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] - (1 - y_i) \log \left(\frac{\exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})}{1 + \exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})}\right) \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ y_i \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + (1 - y_i) \left[\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + \log(1 + \exp (-\iprod{\boldsymbol{x_i}, \boldsymbol{w}}))\right] \right] \\
= & \min_{\boldsymbol{w}} \sum_{i=1}^n \left[ \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} \right] \\
\end{split}
\end{equation}
Because $y_i \in \{0, 1\}$, let's map it to $\{\pm 1\}$.
\begin{equation}
\begin{split}
L(\boldsymbol{w})
& \stackrel{y_i \in \{0, 1\}}{=} \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} \\
& \stackrel{y_i \in \{0, 1\}}{=} \log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] + \log \left[ \exp ( (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} ) \right] \\
& \stackrel{y_i \in \{0, 1\}}{=} \log [(1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})) \cdot \exp ( (1 - y_i) \iprod{\boldsymbol{x_i}, \boldsymbol{w}} ) ] \\
& \stackrel{y_i \in \{0, 1\}}{=} \log [\exp((1-y_i)\iprod{\boldsymbol{x_i}, \boldsymbol{w}}) + \exp(-y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w}})] \\
& \stackrel{y_i \in \{0, 1\}}{=} \begin{cases}
\log [\exp(\iprod{\boldsymbol{x_i}, \boldsymbol{w}}) + 1] & y_i = 0 \\
\log [1 + \exp(-\iprod{\boldsymbol{x_i}, \boldsymbol{w}})] & y_i = 1
\end{cases} \\
& \stackrel{y_i \in \{\pm 1\}}{=} \log [ 1 + \exp(-y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w}}) ]
\end{split}
\end{equation}
\end{question}
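A NumPy sketch of minimizing the $\{\pm 1\}$-label form of this loss by plain gradient descent; the step size and iteration count are arbitrary assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

def logistic_regression_gd(X, y, lr=0.1, steps=1000):
    """X: (n, d); y: (n,) with labels in {-1, +1}.
    Minimizes the mean of log(1 + exp(-y_i <x_i, w>))."""
    w = np.zeros(X.shape[1])
    for _ in range(steps):
        margins = y * (X @ w)                      # y_i <x_i, w>
        # gradient: mean of -y_i x_i * sigmoid(-margin_i)
        grad = -(X.T @ (y * sigmoid(-margins))) / len(y)
        w -= lr * grad
    return w
\end{lstlisting}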
\begin{question}[Multi-Class: Sigmoid $\rightarrow$ Softmax]
\begin{equation}
\Pr(Y=k | X = \boldsymbol{x}; \boldsymbol{W} = [\boldsymbol{w_1}, \dots, \boldsymbol{w_c}])
= \frac{\exp(\iprod{\boldsymbol{x}, \boldsymbol{w_k}})}{\sum_{l=1}^c \exp(\iprod{\boldsymbol{x}, \boldsymbol{w_l}})}
\end{equation}
Maximum likelihood estimation (log loss, cross-entropy loss):
\begin{equation}
\min_{\boldsymbol{W}} \sum_{i=1}^n \left[ -\log \frac{\exp(\iprod{\boldsymbol{x_i}, \boldsymbol{w_{y_i}}})}{\sum_{l=1}^c \exp(\iprod{\boldsymbol{x_i}, \boldsymbol{w_l}})} \right]
\end{equation}
\end{question}
\section{Hard-Margin Support Vector Machines}
\begin{question}[Distance from a Point to a Hyperplane]
Let $H \coloneq \{\boldsymbol{x}: \iprod{\boldsymbol{x}, \boldsymbol{w}} + b = 0\}$, and let $\boldsymbol{x}$ be any point on $H$.
\begin{equation}
\text{Distance}(\boldsymbol{x_i}, \boldsymbol{w})
= \frac{|\iprod{\boldsymbol{x_i} - \boldsymbol{x}, \boldsymbol{w}}|}{\llnorm{\boldsymbol{w}}}
= \frac{|\iprod{\boldsymbol{x_i}, \boldsymbol{w}} - \iprod{\boldsymbol{x}, \boldsymbol{w}}|}{\llnorm{\boldsymbol{w}}}
\stackrel{\boldsymbol{x} \in H}{=} \frac{|\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b|}{\llnorm{\boldsymbol{w}}}
\stackrel{y_i \hat{y_i} > 0}{=} \frac{y_i \hat{y_i}}{\llnorm{\boldsymbol{w}}}
\end{equation}
\end{question}
\begin{question}[Margin Maximization]
Margin is the smallest distance to $H$ among all separable data.
\begin{equation}
\max_{\boldsymbol{w}, b} \min_i \frac{y_i \hat{y_i}}{\llnorm{\boldsymbol{w}}}
\text{, such that }
\forall i, y_i \hat{y_i} > 0
\end{equation}
For any $c > 0$, rescaling $(\boldsymbol{w}, b) \rightarrow (c\boldsymbol{w}, cb)$ leaves the objective unchanged:
\begin{equation}
\begin{split}
\max_{\boldsymbol{w}, b} \min_i \frac{c y_i \hat{y_i}}{\llnorm{c\boldsymbol{w}}}
& = \max_{\boldsymbol{w}, b} \min_i \frac{y_i (\iprod{\boldsymbol{x_i}, c\boldsymbol{w}} + cb)}{\llnorm{c\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \min_i \frac{c y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b)}{c \llnorm{\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \min_i \frac{y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b)}{\llnorm{\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \min_i \frac{y_i \hat{y_i}}{\llnorm{\boldsymbol{w}}}
\end{split}
\end{equation}
Let $c = \frac{1}{\min_i y_i \hat{y_i}}$,
\begin{equation}
\begin{split}
\max_{\boldsymbol{w}, b} \min_i \frac{c y_i \hat{y_i}}{c \llnorm{\boldsymbol{w}}}
& = \max_{\boldsymbol{w}, b} \frac{1}{c \llnorm{\boldsymbol{w}}} \\
& = \max_{\boldsymbol{w}, b} \frac{1}{\llnorm{\boldsymbol{w}}} \text{ s.t. } \min_i y_i \hat{y_i} = 1
\end{split}
\end{equation}
Max $\rightarrow$ Min:
\begin{equation}
\min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1
\end{equation}
\end{question}
\begin{question}[Hard-Margin SVM vs. Perceptron]
\begin{align}
\text{Hard-Margin SVM:} \qquad
& \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 & \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1 \\
\text{Perceptron:} \qquad
& \min_{\boldsymbol{w}, b} 0 & \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1
\end{align}
\end{question}
\begin{question}[Lagrangian Dual]
Dual variables $\boldsymbol{\alpha} \in \mathbb{R}^n$.
\begin{equation}
\begin{split}
\min_{\boldsymbol{w}, b} \; \max_{\boldsymbol{\alpha} \geq 0} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 - \sum_i \alpha_i \left[ y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) - 1 \right]
& = \min_{\boldsymbol{w}, b} \; \begin{cases}
+ \infty, & \text{if } \exists i, y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) < 1 \quad (\alpha_i = + \infty) \\
\frac{1}{2} \llnorm{\boldsymbol{w}}^2 , & \text{if } \forall i, y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) \geq 1 \quad (\forall i, \alpha_i = 0) \\
\end{cases} \\
& = \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2, \quad \text{s.t. } \forall i, y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) \geq 1 \\
& = \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 \text{ s.t. } \forall i, y_i \hat{y_i} \geq 1
\end{split}
\end{equation}
Swap $\min$ and $\max$:
\begin{equation} \label{lagrangian-max-min}
\max_{\boldsymbol{\alpha} \geq 0} \; \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 - \sum_i \alpha_i \left[ y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) - 1 \right]
\end{equation}
Solve inner problem by setting derivative to 0:
\begin{equation} \label{solve-derivative-for-svm-dual}
\frac{\partial L}{\partial \boldsymbol{w}} = \boldsymbol{w} - \sum_i \alpha_i y_i \boldsymbol{x_i} = 0,
\qquad
\frac{\partial L}{\partial b} = - \sum_i \alpha_i y_i = 0
\end{equation}
Plug them into the loss:
\begin{equation}
\begin{split}
L(\boldsymbol{\alpha})
& = \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 - \sum_i \alpha_i \left[ y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) - 1 \right] \\
& = \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 - \sum_i \alpha_i y_i \iprod{\boldsymbol{x_i}, \boldsymbol{w}} - \sum_i \alpha_i y_i b + \sum_i \alpha_i \\
& = \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 - \iprod{\sum_i \alpha_i y_i \boldsymbol{x_i}, \sum_i \alpha_i y_i \boldsymbol{x_i}} - b \sum_i \alpha_i y_i + \sum_i \alpha_i \\
& = \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 - \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2 + \sum_i \alpha_i \\
& = \sum_i \alpha_i - \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2,
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{split}
\end{equation}
Thus, the dual problem \ref{lagrangian-max-min} becomes:
\begin{equation}
\max_{\boldsymbol{\alpha} \geq 0} \sum_i \alpha_i - \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2,
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{equation}
Max $\rightarrow$ min and expand the norm:
\begin{equation}
\min_{\boldsymbol{\alpha} \geq 0} - \sum_i \alpha_i + \frac{1}{2} \sum_i \sum_j \alpha_i \alpha_j y_i y_j \underbrace{\iprod{\boldsymbol{x_i}, \boldsymbol{x_j}}}_{\text{Kernel, closed form w.r.t. }\boldsymbol{x_i}, \boldsymbol{x_j}},
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{equation}
\end{question}
\begin{question}[Support Vectors]
From \ref{solve-derivative-for-svm-dual}, we know $\boldsymbol{w} = \sum_i \alpha_i y_i \boldsymbol{x_i}$.
Vectors with $\alpha_i \neq 0$ are support vectors, which lie on the margin.
\end{question}
\section{Soft-Margin Support Vector Machines}
\begin{question}[Goal]
minimize over $\boldsymbol{w}, b$,
\begin{equation}
\Pr(Y \neq \sign(\hat Y)) = \Pr(Y \hat Y \leq 0)
= \mathbb{E} \; \underbrace{\mathbb{I} [Y \hat Y \leq 0]}_{\text{indicator function}}
\coloneq \mathbb{E} \; l_{0-1}(Y \hat Y)
\end{equation}
where $\hat Y = \iprod{\boldsymbol{X}, \boldsymbol{w}} + b$ and $Y \in \{\pm 1\}$.
\begin{equation}
\begin{split}
\min_{\hat Y: \mathcal{X} \rightarrow \mathbb{R}} \mathbb{E} \; l_{0-1}(Y \hat Y)
& = \min_{\hat Y: \mathcal{X} \rightarrow \mathbb{R}} \mathbb{E}_X \mathbb{E}_{Y|X} \; l_{0-1}(Y \hat Y) \\
& = \mathbb{E}_X \min_{\hat Y: \mathcal{X} \rightarrow \mathbb{R}} \mathbb{E}_{Y|X} \; l_{0-1}(Y \hat Y) \\
\end{split}
\end{equation}
Minimizing the 0-1 error is \textbf{NP-hard}.
\end{question}
\begin{question}[Bayes Rule]
\begin{equation}
\eta(\boldsymbol{x}) \coloneq \argmax_{\hat y \in \{\pm 1\}} \Pr (Y = \hat y | X = \boldsymbol{x})
\end{equation}
\begin{equation}
\eta(\boldsymbol{x}) \coloneq \argmin_{\hat y \in \mathbb{R}} \mathbb{E}_{Y|X=\boldsymbol{x}} \; l_{0-1}(Y \hat y)
\end{equation}
\end{question}
\begin{question}[Classification Calibrated]
A loss $l(y \hat y)$ is classification calibrated, iff $\forall \boldsymbol{x}$,
\begin{equation*}
\hat y(\boldsymbol{x}) \coloneq \argmin_{\hat y \in \mathbb{R}} \mathbb{E}_{Y | X = \boldsymbol{x}} \; l(Y \hat y)
\end{equation*}
has the same sign as the Bayes rule $\eta(\boldsymbol{x}) \coloneq \argmin_{\hat y \in \mathbb{R}} \mathbb{E}_{Y|X=\boldsymbol{x}} \; l_{0-1}(Y \hat y)$.
Notice: $\eta(\boldsymbol{x})$ and $\hat y (\boldsymbol{x})$ provide the \textbf{score}; their sign provides the prediction.
\end{question}
\begin{question}[Characterization under Convexity]
Any \textbf{convex} loss $l$ is classification calibrated iff $l$ is differentiable
at $0$ and $l'(0) < 0$.
\end{question}
\begin{question}[Hinge Loss]
\begin{equation}
l_{hinge}(y \hat y) = (1 - y \hat y)^+ \coloneq \max \{0, 1 - y \hat y\}
= \begin{cases}
1 - y \hat y, & \text{if } y \hat y < 1 \\
0, & \text{otherwise}
\end{cases}
\end{equation}
The classifier that minimizes the expected hinge loss minimizes the expected 0-1 loss.
\end{question}
\begin{question}[Soft-Margin SVM]
\begin{equation}
\min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + C \cdot \sum_i l_{hinge}(y_i \hat y_i), \quad \text{s.t. } \hat y_i = \iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b
\end{equation}
\end{question}
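The dual is derived next; alternatively, the primal above can be minimized directly by subgradient descent on the hinge loss. A rough NumPy sketch, where the step size and iteration count are arbitrary assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def svm_subgradient(X, y, C=1.0, lr=0.01, steps=1000):
    """Minimizes 0.5||w||^2 + C * sum_i max(0, 1 - y_i (<x_i, w> + b)).
    X: (n, d); y: (n,) labels in {-1, +1}."""
    n, d = X.shape
    w, b = np.zeros(d), 0.0
    for _ in range(steps):
        margins = y * (X @ w + b)
        active = margins < 1                     # points with nonzero hinge loss
        gw = w - C * (X[active].T @ y[active])   # subgradient w.r.t. w
        gb = -C * np.sum(y[active])              # subgradient w.r.t. b
        w -= lr * gw
        b -= lr * gb
    return w, b
\end{lstlisting}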
\begin{question}[Lagrangian Dual]
Apply $C \cdot l_{hinge}(t) \coloneq \max\{0, C(1 - t)\} = \max_{0 \leq \alpha \leq C} \alpha (1 - t)$
\begin{equation}
\min_{\boldsymbol{w}, b} \max_{0 \leq \boldsymbol{\alpha} \leq C} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i (1 - y_i \hat y_i)
\end{equation}
Swap $\min$ with $\max$:
\begin{equation}
\max_{0 \leq \boldsymbol{\alpha} \leq C} \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i (1 - y_i \hat y_i)
\end{equation}
Solve it by setting derivative to 0:
\begin{equation}
\frac{\partial L}{\partial \boldsymbol{w}} = \boldsymbol{w} - \sum_i \alpha_i y_i \boldsymbol{x_i} = 0,
\qquad
\frac{\partial L}{\partial b} = - \sum_i \alpha_i y_i = 0
\end{equation}
Plug them into the loss:
\begin{equation}
\begin{split}
\max_{0 \leq \boldsymbol{\alpha} \leq C} \min_{\boldsymbol{w}, b} \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i (1 - y_i \hat y_i)
& = \max_{0 \leq \boldsymbol{\alpha} \leq C} \min_{\boldsymbol{w}, b} \; \frac{1}{2} \llnorm{\boldsymbol{w}}^2 + \sum_i \alpha_i \left[ 1 - y_i (\iprod{\boldsymbol{x_i}, \boldsymbol{w}} + b) \right] \\
& = \max_{0 \leq \boldsymbol{\alpha} \leq C} \sum_i \alpha_i - \frac{1}{2} \llnorm{\sum_i \alpha_i y_i \boldsymbol{x_i}}^2,
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{split}
\end{equation}
Max $\rightarrow$ min and expand the norm:
\begin{equation}
\min_{0 \leq \boldsymbol{\alpha} \leq C} - \sum_i \alpha_i + \frac{1}{2} \sum_i \sum_j \alpha_i \alpha_j y_i y_j \underbrace{\iprod{\boldsymbol{x_i}, \boldsymbol{x_j}}}_{\text{Kernel, closed form w.r.t. }\boldsymbol{x_i}, \boldsymbol{x_j}},
\qquad \text{s.t. } \sum_i \alpha_i y_i = 0
\end{equation}
$C \rightarrow \infty \Rightarrow \text{hard-margin SVM}$;
$C \rightarrow 0 \Rightarrow \text{a constant classifier}$.
\end{question}
\section{Reproducing Kernels}
\begin{question}[(Reproducing) Kernels]
$k: \mathcal{X} \times \mathcal{X} \rightarrow \mathbb{R}$ is a (reproducing) kernel
iff there exists some $\Phi: \mathcal{X} \rightarrow \mathcal{H}$ so that
$\iprod{\Phi(\boldsymbol{x}), \Phi(\boldsymbol{z})} = k(\boldsymbol{x}, \boldsymbol{z})$.
\begin{itemize}
\item A feature transform $\Phi$ determines the corresponding kernel $k$.
\item A kernel $k$ corresponds to some feature transform $\Phi$, which need not be unique. \\
E.g., $\iprod{\phi(\boldsymbol{x}), \phi(\boldsymbol{z})} = \iprod{\phi'(\boldsymbol{x}), \phi'(\boldsymbol{z})}$ for
\begin{enumerate}
\item $\phi(x) \coloneq [x_1^2, \sqrt{2} x_1 x_2] \in \mathbb{R}^2$
\item $\phi'(x) \coloneq [x_1^2, x_1 x_2, x_1 x_2] \in \mathbb{R}^3$
\end{enumerate}
\end{itemize}
\end{question}
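A quick numerical check that the two feature maps above induce the same kernel; the test vectors are arbitrary:
\begin{lstlisting}[language=python]
import numpy as np

def phi(x):   # 2-dimensional feature map
    return np.array([x[0]**2, np.sqrt(2) * x[0] * x[1]])

def phi2(x):  # 3-dimensional feature map, same kernel
    return np.array([x[0]**2, x[0] * x[1], x[0] * x[1]])

x, z = np.array([1.0, 2.0]), np.array([3.0, 4.0])
# both inner products equal x1^2 z1^2 + 2 x1 x2 z1 z2
assert np.isclose(phi(x) @ phi(z), phi2(x) @ phi2(z))
\end{lstlisting}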
\begin{question}[Mercer's Theorem]
$k: \mathcal{X} \times \mathcal{X} \rightarrow \mathbb{R}$ is a kernel, \\
iff $\forall n \in \mathbb{N}, \forall \boldsymbol{x_1}, \dots, \boldsymbol{x_n} \in \mathcal{X}$,
the kernel matrix $K$ such that $K_{ij} \coloneq k(\boldsymbol{x_i}, \boldsymbol{x_j})$ is symmetric and positive semi-definite (PSD).
\begin{equation}
k \text{ is a kernel} \Leftrightarrow \begin{cases}
K_{ij} = K_{ji} & \text{(symmetric)}\\
\iprod{\boldsymbol{\alpha}, K \boldsymbol{\alpha}} = \sum_{i=1}^n \sum_{j=1}^n \alpha_i \alpha_j K_{ij} \geq 0 \qquad \forall \boldsymbol{\alpha} \in \mathbb{R}^n & \text{(PSD)}
\end{cases}
\end{equation}
\end{question}
\begin{question}[Symmetric PSD]
For a symmetric matrix $A$, the following conditions are equivalent.
\begin{enumerate}
\item $A \succeq 0$
\item $A = U^T U$ for some matrix $U$
\item $x^T A x \geq 0$ for every $x \in \mathbb{R}^n$
\item All principal minors of $A$ are nonnegative
\end{enumerate}
\end{question}
\section{Gradient Descent}
\begin{question}[Gradient Descent Template]
Choose an initial point $x^{(0)} \in \mathbb{R}^d$ and repeat:
\begin{equation}
x^{(k)} = x^{(k-1)} - \underbrace{\eta}_{\text{step size}} \nabla f(x^{(k-1)}), \quad k = 1, 2, \dots
\end{equation}
\end{question}
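A direct NumPy transcription of this template; the quadratic example at the end is an illustrative assumption:
\begin{lstlisting}[language=python]
import numpy as np

def gradient_descent(grad_f, x0, eta=0.1, steps=100):
    """Iterates x^{(k)} = x^{(k-1)} - eta * grad_f(x^{(k-1)})."""
    x = np.asarray(x0, dtype=float)
    for _ in range(steps):
        x = x - eta * grad_f(x)
    return x

# example: f(x) = ||x||^2 / 2, so grad f(x) = x; minimum at the origin
x_star = gradient_descent(lambda x: x, np.array([3.0, -2.0]))
\end{lstlisting}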
\begin{question}[Interpretation from Taylor Expansion]
Expand $f$ locally at $x$, replacing the Hessian with $\frac{1}{t} \boldsymbol{I}$:
\begin{equation}
\begin{split}
f(y) & \approx f(x) + \nabla f(x)^T(y-x) + \frac{1}{2t}\llnorm{y-x}^2 \\
\Rightarrow \min_y f(y) & \approx \min_y \left[ f(x) + \nabla f(x)^T(y-x) + \frac{1}{2t}\llnorm{y-x}^2 \right] \\
\end{split}
\end{equation}
The minimum over $y$ is attained at $y - x = -t \nabla f(x)$, i.e., $y = x - t \nabla f(x)$, which is exactly one gradient descent step.
\end{question}
\begin{question}[$L$-smooth or $L$-Lipschitz Continuous]
Assume $f$ is convex and differentiable, and $\nabla f$ is $L$-Lipschitz continuous ($f$ is $L$-smooth). If $f$ is twice differentiable, this is equivalent to:
\begin{equation}
L \boldsymbol{I} \succeq \nabla^2 f(x), \forall x
\end{equation}
\end{question}
\begin{question}[Convergence Rate for Convex Case]
Assume $f$ is $L$-smooth.
Gradient descent with fixed step size $t \leq \frac{1}{L}$ satisfies:
\begin{equation}
f(x^{(k)}) - f(x^*) \leq \frac{\llnorm{x^{(0)} - x^*}^2}{2tk}
\end{equation}
We say gradient descent has convergence rate $O(\frac{1}{k})$,
i.e. $f(x^{(k)}) - f(x^*) \leq \epsilon$ can be achieved using only $O(\frac{1}{\epsilon})$ iterations.
\vspace{5mm}
\textbf{Proof}
\begin{equation}
\begin{split}
f(y)
& = f(x) + \nabla f(x)^T (y-x) + \frac{1}{2} (y-x)^T \nabla^2 f(\xi)(y-x) \\
& \leq f(x) + \nabla f(x)^T(y-x) + \frac{1}{2} L \llnorm{y-x}^2 \qquad (L\text{-smooth}, L \boldsymbol{I} \succeq \nabla^2 f(\xi))
\end{split}
\end{equation}
Plug in gradient descent:
\begin{equation} \label{convergence-1}
\begin{split}
f(x^+)
& = f(y) \\
& \leq f(x) + \nabla f(x)^T(x - t\nabla f(x)-x) + \frac{1}{2} L \llnorm{x - t\nabla f(x)-x}^2 \\
& = f(x) -(1 - \frac{1}{2}Lt) t \llnorm{\nabla f(x)}^2 \\
& \leq f(x) - \frac{1}{2} t \llnorm{\nabla f(x)}^2 \quad (t \leq \frac{1}{L})
\end{split}
\end{equation}
$f$ is convex $\Rightarrow f(x^*) \geq f(x) + \nabla f(x)^T(x^* - x) \Rightarrow f(x) \leq f(x^*) + \nabla f(x)^T(x - x^*)$.
Plug this into \ref{convergence-1}:
\begin{equation}
\begin{split}
& f(x^+) \leq f(x^*) + \nabla f(x)^T (x - x^*) - \frac{1}{2} t \llnorm{\nabla f(x)}^2 \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( 2t \nabla f(x)^T(x - x^*) - t^2 \llnorm{\nabla f(x)}^2 \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( 2t \nabla f(x)^T(x - x^*) - t^2 \llnorm{\nabla f(x)}^2 - \llnorm{x-x^*}^2 + \llnorm{x-x^*}^2 \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( \llnorm{x-x^*}^2 - (\llnorm{x-x^*}^2 + t^2 \llnorm{\nabla f(x)}^2 - 2t \nabla f(x)^T(x - x^*) ) \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( \llnorm{x-x^*}^2 - \llnorm{x - x^* - t\nabla f(x)}^2 \right) \\
\Rightarrow & f(x^+) - f(x^*) \leq \frac{1}{2t}\left( \llnorm{x-x^*}^2 - \llnorm{x^+ - x^*}^2 \right) \\
\end{split}
\end{equation}
Viewing $x^+$ as $x^{(i)}$ and $x$ as $x^{(i-1)}$, and summing over $i$:
\begin{equation}
\begin{split}
\sum_{i=1}^k \paren{ f(x^{(i)}) - f(x^*) }
& \leq \sum_{i=1}^k \frac{1}{2t} \paren{ \llnorm{x^{(i-1)} - x^*}^2 - \llnorm{x^{(i)} - x^*}^2 } \\
& = \frac{1}{2t} \paren{ \llnorm{x^{(0)} - x^*}^2 - \llnorm{x^{(k)} - x^*}^2 } \\
& \leq \frac{1}{2t} \llnorm{x^{(0)} - x^*}^2 \\
\end{split}
\end{equation}
Since \ref{convergence-1} shows that $f(x^{(i)})$ is non-increasing, the last iterate is no worse than the average, which implies
\begin{equation}
f(x^{(k)}) \leq \frac{1}{k} \sum_{i=1}^k f(x^{(i)}) \leq f(x^*) + \frac{\llnorm{x^{(0)} - x^*}^2}{2tk}
\end{equation}
\end{question}
\begin{question}[Convergence Rate for Strong Convexity]
Assume $f$ is differentiable, $L$-smooth, and $m$-strongly convex.
$m$-strong convexity means $f(x) - \frac{m}{2} \llnorm{x}^2$ is convex, i.e., $\nabla^2 f(x) \succeq m \boldsymbol{I}$.
Then there is a constant $0 < \gamma < 1$ such that gradient descent with fixed step size $t \leq \frac{2}{m + L}$ satisfies:
\begin{equation}
f(x^{(k)}) - f(x^*) \leq \gamma^k \frac{L}{2} \llnorm{x^{(0)} - x^*}^2
\end{equation}
Rate is $O(\gamma^k)$. Only $O(\log_{\frac{1}{\gamma}}(\frac{1}{\epsilon}))$ iterations needed.
\end{question}
\begin{question}[Convergence Rate for Non-Convex Case]
$f$ is differentiable and $L$-smooth, but non-convex.
Gradient descent with fixed step size $t \leq \frac{1}{L}$ satisfies:
\begin{equation}
\min_{i=0,\dots,k} \llnorm{\nabla f(x^{(i)})} \leq \sqrt{\frac{2 \left( f(x^{(0)}) - f^* \right)}{t(k+1)}}
\end{equation}
Rate is $O(\frac{1}{\sqrt{k}})$ for finding stationary point.
$O(\frac{1}{\epsilon^2})$ iterations are needed.
\end{question}
\begin{question}[Convergence Rate for Stochastic Gradient Descent]
For convex and $L$-smooth $f$,
\begin{itemize}
\item Gradient Descent
\begin{equation}
\boldsymbol{w^+} = \boldsymbol{w} - t \cdot \frac{1}{n} \sum_{i=1}^n \nabla f_i(\boldsymbol{w})
\end{equation}
\begin{itemize}
\item Step size $t \leq \frac{1}{L}$
\item Time complexity $O(\frac{n}{\epsilon})$
\end{itemize}
\item Stochastic Gradient Descent
\begin{equation}
\boldsymbol{w^+} = \boldsymbol{w} - t \cdot \nabla f_{I_{random}}(\boldsymbol{w})
\end{equation}
\begin{itemize}
\item Step size $t = \frac{1}{k}$ at iteration $k = 1, 2, 3, \dots$ (diminishing step size)
\item Time complexity $O(\frac{1}{\epsilon^2})$
\end{itemize}
\end{itemize}
\end{question}
\section{Fully-Connected Neural Networks}
\begin{question}[Forward and Backward Pass of a 2-Layer MLP]
A 2-layer MLP ($k$ is the NN width, $c$ is the output dim):
\begin{align}
\boldsymbol{x} & = \text{input} & (\boldsymbol{x} \in \mathbb{R}^d) \\
\boldsymbol{z} & = \boldsymbol{W} \boldsymbol{x} + \boldsymbol{b_1} & (\boldsymbol{W} \in \mathbb{R}^{k \times d}, \boldsymbol{z}, \boldsymbol{b_1} \in \mathbb{R}^k) \\
\boldsymbol{h} & = \text{ReLU}(\boldsymbol{z}) & (\boldsymbol{h} \in \mathbb{R}^k) \\
\boldsymbol{\theta} & = \boldsymbol{U} \boldsymbol{h} + \boldsymbol{b_2} & (\boldsymbol{U} \in \mathbb{R}^{c \times k}, \boldsymbol{\theta}, \boldsymbol{b_2} \in \mathbb{R}^c) \\
J & = \frac{1}{2} \llnorm{\boldsymbol{\theta} - \boldsymbol{y}}^2 & (\boldsymbol{y} \in \mathbb{R}^c, J \in \mathbb{R})
\end{align}
\begin{align}
\text{ReLU}(x) = \begin{cases}
x & x > 0 \\
0 & x \leq 0
\end{cases} \\
\text{ReLU}'(x) = \begin{cases}
1 & x > 0 \\
0 & x \leq 0
\end{cases}
\end{align}
Backward pass ($\odot$ is the Hadamard product, i.e. element-wise product):
\begin{align}
\frac{\partial J}{\partial \boldsymbol{\theta}} & = \boldsymbol{\theta} - \boldsymbol{y} \\
\frac{\partial J}{\partial \boldsymbol{U}} & = \frac{\partial J}{\partial \boldsymbol{\theta}} \cdot \frac{\partial \boldsymbol{\theta}}{\partial \boldsymbol{U}} = (\boldsymbol{\theta} - \boldsymbol{y}) \boldsymbol{h}^T \\
\frac{\partial J}{\partial \boldsymbol{b_2}} & = \frac{\partial J}{\partial \boldsymbol{\theta}} \cdot \frac{\partial \boldsymbol{\theta}}{\partial \boldsymbol{b_2}} = \boldsymbol{\theta} - \boldsymbol{y} \\
\frac{\partial J}{\partial \boldsymbol{h}} & = \frac{\partial J}{\partial \boldsymbol{\theta}} \cdot \frac{\partial \boldsymbol{\theta}}{\partial \boldsymbol{h}} = \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \\
\frac{\partial J}{\partial \boldsymbol{z}} & = \frac{\partial J}{\partial \boldsymbol{h}} \cdot \frac{\partial \boldsymbol{h}}{\partial \boldsymbol{z}} = \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \odot \text{ReLU}'(\boldsymbol{z}) \\
\frac{\partial J}{\partial \boldsymbol{W}} & = \frac{\partial J}{\partial \boldsymbol{z}} \cdot \frac{\partial \boldsymbol{z}}{\partial \boldsymbol{W}} = \left[ \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \odot \text{ReLU}'(\boldsymbol{z}) \right] \boldsymbol{x}^T \\
\frac{\partial J}{\partial \boldsymbol{b_1}} & = \frac{\partial J}{\partial \boldsymbol{z}} \cdot \frac{\partial \boldsymbol{z}}{\partial \boldsymbol{b_1}} = \boldsymbol{U}^T(\boldsymbol{\theta} - \boldsymbol{y}) \odot \text{ReLU}'(\boldsymbol{z})
\end{align}
\end{question}
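A NumPy sketch of one forward and backward pass, implementing exactly the equations above; the input shapes follow the notes:
\begin{lstlisting}[language=python]
import numpy as np

def mlp_forward_backward(x, y, W, b1, U, b2):
    """x: (d,), y: (c,), W: (k, d), b1: (k,), U: (c, k), b2: (c,)."""
    # forward pass
    z = W @ x + b1
    h = np.maximum(z, 0)                # ReLU
    theta = U @ h + b2
    J = 0.5 * np.sum((theta - y) ** 2)
    # backward pass
    dtheta = theta - y                  # dJ/dtheta
    dU = np.outer(dtheta, h)            # dJ/dU = (theta - y) h^T
    db2 = dtheta
    dh = U.T @ dtheta                   # dJ/dh = U^T (theta - y)
    dz = dh * (z > 0)                   # dJ/dz = dJ/dh * ReLU'(z)
    dW = np.outer(dz, x)                # dJ/dW = dz x^T
    db1 = dz
    return J, (dW, db1, dU, db2)
\end{lstlisting}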
\begin{question}[Universal Approximation Theorem by 2-Layer NNs]
For any continuous function $f: \mathbb{R}^d \rightarrow \mathbb{R}^c$,
any compact set $K \subset \mathbb{R}^d$, and any $\epsilon > 0$,
there exists $k \in \mathbb{N}, \boldsymbol{W} \in \mathbb{R}^{k \times d}, \boldsymbol{b} \in \mathbb{R}^k, \boldsymbol{U} \in \mathbb{R}^{c \times k}$
such that
\begin{equation}
\sup_{\boldsymbol{x} \in K} \llnorm{f(\boldsymbol{x}) - g(\boldsymbol{x})} < \epsilon
\end{equation}
where $g(\boldsymbol{x}) = \boldsymbol{U}(\sigma(\boldsymbol{W}\boldsymbol{x} + \boldsymbol{b}))$
and $\sigma$ is the element-wise ReLU operation.
As long as the 2-layer MLP is wide enough, it can approximate any
continuous function arbitrarily closely.
\end{question}
\section{Convolutional Neural Networks}
\begin{question}[Controlling the Convolution]
Hyperparameters:
\begin{itemize}
\item \textbf{Filter (kernel) size}: width $\times$ height.
\item \textbf{Number of filters (kernels)}. \\
Weights are not shared between different filters (kernels).
\item \textbf{Stride}: how many pixels the filter moves each time.
\item \textbf{Padding}: add zeros around the boundary of the input.
\end{itemize}
\end{question}
\begin{question}[Size Calculation]
\begin{align*}
\text{Input size: } & m \times n \times c_{in} \\
\text{Filter size: } & a \times b \times c_{in} \\
\text{Stride: } & s \times t \\
\text{Padding: } & p \times q
\end{align*}
Output size:
\begin{equation}
\floor*{1 + \frac{m + 2p - a}{s}} \times \floor*{1 + \frac{n + 2q - b}{t}}
\end{equation}
\end{question}
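A small Python helper implementing this formula; the assertion is an illustrative example:
\begin{lstlisting}[language=python]
def conv_output_size(m, n, a, b, s, t, p, q):
    """Input m x n, filter a x b, stride s x t, padding p x q.
    Integer floor division matches the floor in the formula."""
    return (1 + (m + 2 * p - a) // s, 1 + (n + 2 * q - b) // t)

# e.g., a 32x32 input with a 5x5 filter, stride 1, no padding -> 28x28
assert conv_output_size(32, 32, 5, 5, 1, 1, 0, 0) == (28, 28)
\end{lstlisting}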
\part{For Final}
\section{Transformer}
\begin{question}[Attention Layer Inputs and Outputs]
Inputs: $V \in \mathbb{R}^{n \times d}$, $K \in \mathbb{R}^{n \times d}$, $Q \in \mathbb{R}^{m \times d}$.
Output: an $m \times d$ matrix.
\begin{itemize}
\item \textbf{Self Attention}: $m = n$,
\item \textbf{Cross Attention}: $m \neq n$ where $m$ is the sequence length of decoder, $n$ is the sequence length of encoder.
\end{itemize}
\end{question}
\begin{question}[Attention Layer Calculation]
\begin{equation}
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d}}\right)V
\end{equation}
Softmax is applied row-wise, i.e., each row of $QK^T$ is normalized to sum to 1.
\end{question}
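A NumPy sketch of this computation; the max-subtraction before exponentiating is a standard numerical-stability trick and does not change the softmax:
\begin{lstlisting}[language=python]
import numpy as np

def attention(Q, K, V):
    """Q: (m, d), K: (n, d), V: (n, d) -> (m, d)."""
    d = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d)                  # (m, n)
    scores -= scores.max(axis=-1, keepdims=True)   # stability; softmax unchanged
    A = np.exp(scores)
    A /= A.sum(axis=-1, keepdims=True)             # row-wise softmax
    return A @ V
\end{lstlisting}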
\begin{question}[Learnable Attention Layer]
\begin{equation}
\text{Attention}(XW^v, XW^k, XW^q) = \text{softmax}\left(\frac{XW^q(XW^k)^T}{\sqrt{d}}\right)XW^v
\end{equation}
\end{question}
\begin{question}[RMSNorm (LLaMA's Choice)]
\begin{equation}
\bar{a_i} = \frac{a_i}{\text{RMS}(a)}\gamma = \frac{a_i}{\sqrt{\frac{1}{d}\sum_{j=1}^d a_j^2}}\gamma
\end{equation}
\end{question}
\begin{question}[Transformer Loss]
\begin{equation}
\min_{W} \hat{\mathbb{E}}\left[ - \iprod{Y, \log \hat{Y}} \right]
\end{equation}
$Y$ is output sequence, one-hot; \\
$\hat{Y}$ is the predicted probabilities
\end{question}
\begin{question}[Transformer Implementation] As follows.
\begin{lstlisting}[language=python]
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RMSNorm(nn.Module):
    def __init__(self, hidden_dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_dim))

    def forward(self, hidden_state):
        norm = hidden_state.pow(2).mean(-1, keepdim=True)
        return hidden_state * self.weight * torch.rsqrt(norm + self.eps)

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.q_linear = nn.Linear(hidden_dim, hidden_dim)
        self.k_linear = nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = nn.Linear(hidden_dim, hidden_dim)
        self.o_linear = nn.Linear(hidden_dim, hidden_dim)
        self.norm = RMSNorm(hidden_dim)

    def forward(self, hidden_state, mask, past_kv=None, use_cache=True):
        bs = hidden_state.shape[0]
        residual = hidden_state
        hidden_state = self.norm(hidden_state)  # LLaMA-style pre-normalization
        q = self.q_linear(hidden_state)  # (bs, seqlen, hidden_dim)
        k = self.k_linear(hidden_state)
        v = self.v_linear(hidden_state)
        # split heads: (bs, num_heads, seqlen, head_dim)
        q = q.view(bs, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(bs, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(bs, -1, self.num_heads, self.head_dim).transpose(1, 2)
        q, k = apply_rope(q, k)  # rotary embedding, assumed defined elsewhere
        # KV cache: prepend cached keys/values along the sequence dimension
        if past_kv is not None:
            past_k, past_v = past_kv
            k = torch.cat([past_k, k], dim=2)
            v = torch.cat([past_v, v], dim=2)
        new_past_kv = (k, v) if use_cache else None
        # scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
        scores = scores + mask * -1e9  # mask out disallowed positions
        attention = F.softmax(scores, dim=-1)
        output = torch.matmul(attention, v)
        # concatenate heads back to (bs, seqlen, hidden_dim)
        output = output.transpose(1, 2).contiguous().view(bs, -1, self.hidden_dim)
        output = self.o_linear(output)
        output = output + residual
        return (output, new_past_kv) if use_cache else output
\end{lstlisting}
\end{question}
\section{Large Language Models}
\begin{question}[BERT vs. GPT]
BERT is an encoder; GPT is a decoder.
\begin{itemize}
\item BERT predicts middle words; GPT predicts the next word.
\item BERT is \textbf{NOT} auto-regressive; GPT is auto-regressive.
\end{itemize}
\end{question}
\begin{question}[GPT -- Generative Pre-Training]
\begin{equation}
\min_{\Theta} \hat{\mathbb{E}} - \log \prod_{j=1}^m \Pr(x_j \vert x_1, \dots, x_{j-1}; \Theta)
\end{equation}
\end{question}
\begin{question}[Fine-Tuning Tasks]
Supervised fine-tuning tasks:
\begin{equation}
\min_{\Theta} \underbrace{- \hat{\mathbb{E}} \log \Pr(y \vert X_{1:m}; \Theta)}_{\text{task-aware supervised loss}} \underbrace{- \lambda \hat{\mathbb{E}} \log \prod_{j=1}^m \Pr(x_j \vert X_{1:j-1}; \Theta)}_{\text{pretraining loss}}
\end{equation}
\end{question}
\begin{question}[BERT $\rightarrow$ RoBERTa]
Trains longer, with bigger batches, over more data and longer sequences.
Removes the next-sentence-prediction objective.
\end{question}
\begin{question}[Sentence-BERT]
A twin (Siamese) network architecture that uses BERT to derive sentence embeddings.
\end{question}
\begin{question}[GPT-2]
1.5B parameters.
\begin{itemize}
\item 10x larger than GPT-1
\item The training method is the same as GPT-1's.
\item Performs on par with BERT on fine-tuning tasks.
\item Good at zero-shot learning.
\item Open-source.
\end{itemize}
\end{question}
\begin{question}[GPT-3]
175B parameters.
\begin{itemize}
\item 100x larger than GPT-2.
\item The training method is the same as GPT-2's.
\item New phenomenon: \textbf{in-context learning} (ICL, or few-shot learning) and \textbf{chain-of-thoughts} (CoT).
\end{itemize}
\end{question}
\begin{question}[GPT-3.5 -- RLHF]
Reinforcement Learning from Human Feedback (RLHF).
\begin{itemize}
\item state = prompt
\item action = model output
\item policy function = LLM
\item reward = levels of matching human feedback
\end{itemize}
Pairwise comparison loss to train the reward model $r_\theta$:
\begin{equation}
\mathcal{L}_{\text{pair}}(\theta) = - \frac{1}{\binom{K}{2}} \mathbb{E}_{(x, y_w, y_l)} \left[ \log ( \sigma(r_\theta(x, y_w) - r_\theta(x, y_l)) ) \right]
\end{equation}
Proximal Policy Optimization (PPO) to maximize objective:
\begin{equation}
\max_{\Phi} \mathbb{E}_{(x, y)} \left[ \underbrace{r_\theta(x, y)}_{\text{maximize reward}} - \beta \underbrace{\log\left( \frac{\pi_\Phi^{\text{RL}}(y \vert x)}{\pi^{\text{SFT}}(y \vert x)}\right) }_{\text{keep close to SFT model}} \right] + \gamma \underbrace{\mathbb{E}_{x \sim \text{pretrain}}\left[\log \pi_\Phi^{\text{RL}}(x)\right]}_{\text{pretraining loss}}
\end{equation}
\end{question}
\section{Speculative Sampling}
\begin{question}[Rejection Sampling Check]
Draft tokens are checked in parallel. Let $q$ be the draft model's distribution, $p$ the target model's distribution, and $t \sim q$ the draft token.
\begin{itemize}
\item $r \sim U(0, 1), \quad \text{if } r < \underbrace{\min\left(1, \frac{p(t)}{q(t)}\right)}_{\text{accept rate}}$, next token $= t$.
\item else: next token $= t' \sim \underbrace{\text{norm}(\max(0, p-q))}_{\text{residual distribution}}$.
\end{itemize}
\end{question}
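A NumPy sketch of one accept/reject step under this scheme; the function name and the \texttt{rng} argument are assumptions:
\begin{lstlisting}[language=python]
import numpy as np

def speculative_step(p, q, rng):
    """q: draft distribution, p: target distribution (1-D arrays summing
    to 1); rng: np.random.Generator."""
    t = rng.choice(len(q), p=q)                # draft token t ~ q
    if rng.random() < min(1.0, p[t] / q[t]):   # accept with rate min(1, p/q)
        return t
    # residual distribution norm(max(0, p - q)); if p == q everywhere,
    # the acceptance rate is 1 and this branch is never reached
    residual = np.maximum(p - q, 0.0)
    return rng.choice(len(p), p=residual / residual.sum())
\end{lstlisting}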
\begin{question}[Proof: Rejection Sampling $\equiv t \sim p$]
\begin{align}
\begin{split}
\min(p(t), q(t)) + \max(0, p(t) - q(t))
&= \begin{cases}
p(t) + 0 & \text{if } p(t) < q(t) \\
q(t) + p(t) - q(t) & \text{if } p(t) \geq q(t)
\end{cases} \\
&= p(t)
\end{split} \\
\Rightarrow & \sum_t (\min(p(t), q(t)) + \max(0, p(t) - q(t))) = \sum_t p(t) = 1 \\
\Rightarrow & 1 - \sum_t \min(p(t), q(t)) = \sum_t \max(0, p(t) - q(t))
\end{align}
\begin{equation}
\begin{split}
\Pr(X = t) &= \Pr(\tilde{X} = t) \Pr(\tilde{X}\text{ accept} \vert \tilde{X} = t) + \Pr(\tilde{X}\text{ reject}) \Pr(\tilde{\tilde{X}} = t \vert \tilde{X}\text{ reject}) \\
&= q(t) \cdot \min\left(1, \frac{p(t)}{q(t)}\right) + (1 - \Pr(\tilde{X}\text{ accept})) \cdot \text{norm}(\max(0, p(t) - q(t))) \\
&= \min\left(q(t), p(t)\right) + \left(1 - \sum_{t'} \min\left(p(t'), q(t')\right)\right) \cdot \frac{\max(0, p(t) - q(t))}{\sum_{t'} \max(0, p(t') - q(t'))} \\
&= \min\left(q(t), p(t)\right) + \max(0, p(t) - q(t)) \\
&= p(t)
\end{split}
\end{equation}
\end{question}
\section{Generative Adversarial Networks}
\begin{question}[Representation through Push-Forward]
Let $r$ be any continuous distribution on $\mathbb{R}^h$.
For any distribution $p$ on $\mathbb{R}^d$,
there exists a push-forward map $G: \mathbb{R}^h \rightarrow \mathbb{R}^d$ such that
\begin{equation}
\boldsymbol{z} \sim r \Longrightarrow G(\boldsymbol{z}) \sim p
\end{equation}
\end{question}
\begin{question}[Discriminator's Goal]
For a fixed generator $G$, minimize a log loss over $D$:
\begin{itemize}
\item If $x$ is real, minimize $-\log D(x)$;
\item If $x$ is fake, minimize $-\log(1 - D(x))$.
\end{itemize}
\begin{equation}
\min_D -\frac{1}{2} \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] - \frac{1}{2} \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right]
\end{equation}
\end{question}
\begin{question}[Generator's Goal]
For a fixed discriminator $D$, maximize the same log-loss objective over $G$:
\begin{equation}
\begingroup\color{red}{\max_G}\endgroup -\frac{1}{2} \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] - \frac{1}{2} \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right]
\end{equation}
\end{question}
\begin{question}[Solver]
\begin{equation}
\min_G \max_D V(G, D) = \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] + \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right]
\end{equation}
Solved by alternating minimization-maximization:
\begin{itemize}
\item G step: Fix $D$ and update $G$ by one gradient descent step
\item D step: Fix $G$ and update $D$ by one gradient ascent step
\item Repeat until the algorithm reaches an approximate equilibrium
\end{itemize}
\end{question}
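A schematic PyTorch sketch of one such alternating round; the networks \texttt{G} and \texttt{D} (with \texttt{D} outputting probabilities in $(0,1)$), their optimizers, the real batch \texttt{x}, and the latent width \texttt{h\_dim} are all assumed to exist. This is an illustration, not a full training loop:
\begin{lstlisting}[language=python]
import torch

def gan_round(G, D, opt_g, opt_d, x, h_dim):
    z = torch.randn(x.shape[0], h_dim)
    # D step: ascend on log D(x) + log(1 - D(G(z))), i.e. descend its negation;
    # detach G(z) so the generator is not updated here
    d_loss = -(torch.log(D(x)) + torch.log(1 - D(G(z).detach()))).mean()
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    # G step: descend on log(1 - D(G(z)))
    g_loss = torch.log(1 - D(G(z))).mean()
    opt_g.zero_grad(); g_loss.backward(); opt_g.step()
    return d_loss.item(), g_loss.item()
\end{lstlisting}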
\begin{question}[Solution of $D^*$]
Let $p_g(x)$ be the density of $x$ estimated by the generator $G$.
For $G$ fixed, the optimal discriminator $D$ is $D_G^*(x) = \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}$
\textbf{Proof:}
\begin{equation}
\begin{split}
V(G, D) &\coloneq \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D(x)\right] + \mathbb{E}_{z \sim r} \left[\log(1 - D(G(z)))\right] \\
&= \int \log D(x) p_{\text{data}}(x) dx + \int \log(1 - D(G(z))) r(z) dz \\
&= \int \underbrace{\log D(x) p_{\text{data}}(x) + p_g(x) \log(1 - D(x))}_{f(D(x))} dx
\end{split}
\end{equation}
For any fixed $x$, taking derivative $= 0$:
\begin{equation}
\begin{split}
f'(D(x)) &= \frac{p_{\text{data}}(x)}{D(x)} - \frac{p_g(x)}{1 - D(x)} = 0 \\
D_G^*(x) &= \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}
\end{split}
\end{equation}
\end{question}
\begin{question}[Solution of $G^*$]
The global minimum of $\min_G \max_D V(G, D)$ is achieved if
and only if $p_g = p_{\text{data}}$. The optimal objective value is $-\log 4$.
\textbf{Proof:}
\begin{equation}
\begin{split}
V(G, D_G^*) &= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D_G^*(x)\right] + \mathbb{E}_{z \sim r} \left[\log(1 - D_G^*(G(z)))\right] \\
&= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log D_G^*(x)\right] + \mathbb{E}_{x \sim p_g} \left[\log(1 - D_G^*(x))\right] \\
&= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}\right] + \mathbb{E}_{x \sim p_g} \left[\log \frac{p_g(x)}{p_{\text{data}}(x) + p_g(x)}\right] \\
\end{split}
\end{equation}
By definition of KL divergence $\text{KL}(P \| Q) = \mathbb{E}_{x \sim P} \left[\log \frac{p(x)}{q(x)}\right]$, we have:
\begin{equation}
\begin{split}
V(G, D_G^*) &= \mathbb{E}_{x \sim p_{\text{data}}} \left[\log \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_g(x)}\right] + \mathbb{E}_{x \sim p_g} \left[\log \frac{p_g(x)}{p_{\text{data}}(x) + p_g(x)}\right] \\
&= -\log 4 + \text{KL}\left(p_{\text{data}} \| \frac{p_\text{data} + p_g}{2}\right) + \text{KL}\left(p_g \| \frac{p_{\text{data}} + p_g}{2}\right) \\
&= -\log 4 + 2 \cdot \text{JSD}(p_{\text{data}} \| p_g) \\
& \geq -\log 4
\end{split}
\end{equation}
The equality holds if and only if $p_{\text{data}} = p_g$.
\end{question}
\section{Adversarial Attacks}
\begin{question}[Principle of Generating Adversarial Attacks]
\begin{equation}
\max_{\left\|x_{\text{adv}}-x\right\|_{\infty} \leq \epsilon} \mathcal{L}(C(x_{\text{adv}}), y)
\end{equation}
where $C$ is the end-to-end classifier, i.e., the composition of the network $f$ and the prediction rule $h$.
\end{question}
\begin{question}[Different Solvers] Solvers for the attack optimization problem:
\begin{itemize}
\item \textbf{Zero-Order Solvers} (only access to the output of NN)
\begin{itemize}
\item Black-box attack
\end{itemize}
\item \textbf{First-Order Solvers} (access to the gradient of NN)
\begin{itemize}
\item White-box attack
\item Fast Gradient Sign Method (FGSM), BIM, PGD, CW attack, \dots
\end{itemize}
\item \textbf{Second-Order Solvers} (access to the Hessian matrix)
\begin{itemize}
\item White-box attack
\item L-BFGS attack
\end{itemize}
\end{itemize}
\end{question}
\begin{question}[H\"older Inequality]
For any $p, q \geq 1$ such that $\frac{1}{p} + \frac{1}{q} = 1$,
\begin{equation} \label{eq:holder-inequality}
\left\|x\right\|_p \cdot \left\|y\right\|_q \geq \left\langle x, y \right\rangle
\end{equation}
where $\left\langle x, y \right\rangle$ is the inner product.
$\| \cdot \|_p$ and $\| \cdot \|_q$ are also known as dual norms.
\begin{itemize}
\item $\| \cdot \|_2$ is self-dual.
\item $\| \cdot \|_\infty$ and $\| \cdot \|_1$ are dual norms.
\end{itemize}
\end{question}
\begin{question}[FGSM -- Fast Gradient Sign Method]
White-box and non-targeted (maximize the loss w.r.t. the true label).
Do linear expansion at $x$:
\begin{equation}
\mathcal{L}(C(x + \delta), y) \approx \underbrace{\mathcal{L}(C(x), y)}_{\text{constant}} + \nabla_x \mathcal{L}(C(x), y) \cdot \delta
\end{equation}
The problem reduces to:
\begin{equation}
\max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \nabla_x \mathcal{L}(C(x), y) \cdot \delta
\end{equation}
By H\"older's inequality \eqref{eq:holder-inequality}, we have:
\begin{equation}
\nabla_x \mathcal{L}(C(x), y) \cdot \delta \leq \left\|\delta\right\|_{\infty} \cdot \left\|\nabla_x \mathcal{L}(C(x), y)\right\|_1
\leq \epsilon \cdot \left\|\nabla_x \mathcal{L}(C(x), y)\right\|_1
\end{equation}
The upper bound is attained at $\delta = \epsilon \cdot \text{sign}(\nabla_x \mathcal{L}(C(x), y))$. Thus, the adversarial example is generated by:
\begin{equation}
x_{\text{adv}} = x + \epsilon \cdot \text{sign}(\nabla_x \mathcal{L}(C(x), y))
\end{equation}
where $\epsilon$ is the perturbation size.
\end{question}
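A PyTorch sketch of this attack; \texttt{model} and \texttt{loss\_fn} are assumed to be standard PyTorch callables:
\begin{lstlisting}[language=python]
import torch

def fgsm(model, loss_fn, x, y, eps):
    """x_adv = x + eps * sign(grad_x L(C(x), y))."""
    x = x.clone().detach().requires_grad_(True)
    loss_fn(model(x), y).backward()
    return (x + eps * x.grad.sign()).detach()
\end{lstlisting}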
\begin{question}[BIM -- Basic Iterative Method]
BIM is an iterative version of FGSM.
\begin{itemize}
\item Initialize $x^{(0)} = x$.
\item For $k = 1, 2, \dots, K$:
\begin{equation}
x^{(k)} = x^{(k-1)} + \gamma \cdot \text{sign}(\nabla_x \mathcal{L}(C(x^{(k-1)}), y))
\end{equation}
\end{itemize}
Issues:
\begin{itemize}
\item With repeated steps, the accumulated perturbation can grow beyond the budget $\epsilon$.
\item For a pre-defined $\epsilon$, $x^{(k)}$ may not satisfy $\|x^{(k)} - x\|_\infty \leq \epsilon$.
\end{itemize}
\end{question}
\begin{question}[PGD -- Projected Gradient Descent]
To resolve the issue of BIM, PGD involves a truncation operation:
\begin{itemize}
\item Initialize $x^{(0)} = x + \delta$, where $\delta$ is drawn uniformly from $[-\epsilon, \epsilon]^d$.
\item For $k = 1, 2, \dots, K$:
\begin{equation}
x^{(k)} = \text{clip}_{(-\epsilon, \epsilon)}(x^{(k-1)} + \gamma \cdot \text{sign}(\nabla_x \mathcal{L}(C(x^{(k-1)}), y)))
\end{equation}
\end{itemize}
where $\text{clip}_{(-\epsilon, \epsilon)}$ projects its argument back onto the $\ell_\infty$ ball of radius $\epsilon$ around the original input $x$.
\end{question}
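A PyTorch sketch following the same conventions as the FGSM sketch above:
\begin{lstlisting}[language=python]
import torch

def pgd(model, loss_fn, x, y, eps, gamma, K):
    """K signed-gradient steps, each clipped back to the l-infinity ball
    of radius eps around the original x."""
    delta = torch.empty_like(x).uniform_(-eps, eps)  # random start
    for _ in range(K):
        xk = (x + delta).clone().detach().requires_grad_(True)
        loss_fn(model(xk), y).backward()
        delta = (delta + gamma * xk.grad.sign()).clamp(-eps, eps)  # project
    return (x + delta).detach()
\end{lstlisting}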
\begin{question}[Targeted PGD Attack]
Objective:
\begin{itemize}
\item \textbf{Untargeted}: \begin{equation}
\max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \mathcal{L}(C(x + \delta), y_{\text{true}})
\end{equation}
\item \textbf{Targeted}: \begin{equation}
\begingroup\color{red}\min_{\left\|\delta\right\|_{\infty} \leq \epsilon}\endgroup \mathcal{L}(C(x + \delta), \begingroup\color{red}y_{\text{target}}\endgroup)
\end{equation}
\end{itemize}
\end{question}
\section{Adversarial Robustness}
\begin{question}[Defense Mechanisms]
Categorized into two types:
\begin{itemize}
\item \textbf{Gradient Masking}: hide the gradients and make first-order attacks fail
\begin{itemize}
\item \textbf{Shattered Gradients}
By applying a non-smooth or non-differentiable preprocessor $g$ to the inputs,
and then training a DNN model $f$ on the preprocessed inputs $g(x)$.
\item \textbf{Stochastic/Randomized Gradients}
Apply some form of randomization of the DNN model.
E.g. train a set of classifiers and during the testing phase randomly select one classifier to predict the labels.
\end{itemize}
\item \textbf{Adversarial Training}:
\begin{equation}
\min_{C} \mathbb{E}_{(x, y) \sim \mathcal{D}^n} \left[ \max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \mathcal{L}(C(x + \delta), y) \right]
\end{equation}
\end{itemize}
\end{question}
\begin{question}[Trade-Off between Natural and Robust Error]
\begin{equation}
\min_f R_{\text{nat}}(f) + R_{\text{rob}}(f) / \lambda
\end{equation}
\begin{equation}
\begin{split}
R_{\text{nat}}(f) &\coloneq \Pr_{x,y \sim \mathcal{D}} \{f(x) y \leq 0\} \\
&= \mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \mathbb{I}(f(x) y \leq 0) \right] \\
\end{split}
\end{equation}
\begin{equation}
\begin{split}
R_{\text{rob}}(f) &\coloneq \Pr_{x,y \sim \mathcal{D}} \{\exists \delta \text{ with } \left\|\delta\right\|_\infty \leq \epsilon \text{ s.t. } f(x + \delta) y \leq 0\} \\
&= \mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \mathbb{I}(f(x + \delta) y \leq 0) \right] \\
\end{split}
\end{equation}
Approximate by a differentiable surrogate loss $\Phi$:
\begin{equation}
R_{\text{nat}}(f) \approx \mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \Phi(f(x) y) \right]
\end{equation}
\end{question}
\begin{question}[TRADES]
\begin{equation}
\min_f \underbrace{\left[ \underbrace{\mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \Phi(f(x) y) \right]}_{\text{minimize diff btw } f(x) \text{ and } y \text{ for accuracy}} + \underbrace{\mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \max_{\left\|\delta\right\|_{\infty} \leq \epsilon} \Phi(f(x + \delta) f(x)) \right]}_{\text{minimize diff btw } f(x) \text{ and } f(x+\delta) \text{ for robustness}} / \lambda \right]}_{\text{TRADES Loss}}
\end{equation}
\end{question}
\section{Self-Supervised Learning}
\begin{question}[Contrastive Learning]
Maximize the softmax probability of the positive pair among all candidates:
\begin{equation}
\max_\Theta \Pr\nolimits_1 = \frac{\exp(z_1)}{\sum_j \exp(z_j)}
\end{equation}
where $z_1$ is the similarity score of the positive pair and the remaining $z_j$ are scores of negative pairs.
\end{question}
\end{document}