forked from ajtulloch/Elements-of-Statistical-Learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathESL-Chap4Solutions.tex
137 lines (120 loc) · 11.5 KB
/
ESL-Chap4Solutions.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
\chapter{Linear Methods for Classification}
\begin{exer}
Show how to solve the generalised eigenvalue problem $\max a^T B a$ subject to $a^T W a = 1$ by transforming it to a standard eigenvalue problem.
\end{exer}
\begin{proof}
By Lagrange multipliers, we have that the function $\mathcal{L}(a) = a^T B a - \lambda(a^T W a - 1)$ has a critical point where \[
\frac{d \mathcal{L}}{da} = 2 a^T B^T - 2 \lambda a^T W^T = 0,
\] that is, where $Ba = \lambda Wa$. If we let $W = D^T D$ (Cholesky decomposition), $C = D^{-1} B D^{-1}$, and $y = Da$, we obtain that our solution becomes \[
Cy = \lambda y,
\] and so we can convert our problem into an eigenvalue problem. It is clear that if $y_m$ and $\lambda_m$ are the maximal eigenvector and eigenvalue of the reduced problem, then $D^{-1} y_m$ and $\lambda_m$ are the maximal eigenvector and eigenvalue of the generalized problem, as required.
\end{proof}
\begin{exer}
Suppose that we have features $x \in \mathbb{R}^p$, a two-class response, with class sizes $N_1, N_2$, and the target coded as $-N/N_1, N/N_2$.
\begin{enumerate}
\item Show that the LDA rule classifies to class 2 if
\[
x^T \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) > \frac{1}{2} \hat \mu_2^T \hat \Sigma^{-1} \hat \mu_2 - \frac{1}{2} \hat \mu_1^T \hat \Sigma^{-1} \hat \mu_1 + \log \frac{N_1}{N} - \log \frac{N_2}{N}
\]
\item Consider minimization of the least squares criterion
\[
\sum_{i=1}^N \left(y_i - \beta_0 - \beta^T x_i \right)^2
\]
Show that the solution $\hat \beta$ satisfies
\[
\left( (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B \right) \beta = N (\hat \mu_2 - \hat \mu_1 )
\] where $\hat \Sigma_B = (\hat \mu_2 - \hat \mu_1) (\hat \mu_2 - \hat \mu_1)^T$.
\item Hence show that $\hat \Sigma_B \beta$ is in the direction $(\hat \mu_2 - \hat \mu_1)$, and thus \[
\hat \beta \propto \hat \Sigma^{-1}(\hat \mu_2 - \hat \mu_1)
\] and therefore the least squares regression coefficient is identical to the LDA coefficient, up to a scalar multiple.
\item Show that this holds for any (distinct) coding of the two classes.
\item Find the solution $\hat \beta_0$, and hence the predicted values $\hat \beta_0 + \hat \beta^T x$. Consider the following rule: classify to class 2 if $\hat y_i > 0$ and class 1 otherwise. Show that this is not the same as the LDA rule unless the classes have equal numbers of observations.
\end{enumerate}
\end{exer}
\begin{proof}
We use the notation of Chapter 4.
\begin{enumerate}
\item Since in the two class case, we classify to class 2 if $\delta_1(x) < \delta_2(x)$. Substituting this into our equation for the Linear discriminant functions, we have \begin{align*}
\delta_1(x) &< \delta_2(x) \\
x^T \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) &> \frac{1}{2} \hat \mu_2^T \hat \Sigma^{-1} \hat \mu_2 - \frac{1}{2} \hat \mu_1^T \hat \Sigma^{-1} \hat \mu_1 + \log \frac{N_1}{N} - \log \frac{N_2}{N}
\end{align*}
as required.
\item Let $U_i$ be the $n$ element vector with $j$-th element $1$ if the $j$-th observation is class $i$, and zero otherwise. Then we can write our target vector $Y$ as $t_1 U_1 + t_2 U_2$, where $t_i$ are our target labels, and we have $\mathbf{1} = U_1 + U_2$. Note that we can write our estimates $\hat \mu_1, \hat \mu_2$ as $X^T U_i = N_i \hat \mu_i$, and that $X^T Y = t_1 N_1 \hat \mu_1 + t_2 N_2 \hat \mu_2$.
By the least squares criterion, we can write \[
RSS = \sum_{i=1}^{N} (y_i - \beta_0 - \beta^T X)^2 = (Y - \beta_0 \mathbf{1} - X \beta)^T (Y - \beta_0 \mathbf{1} - X\beta)
\] Minimizing this with respect to $\beta$ and $\beta_0$, we obtain \begin{align*} 2 X^T X \beta - 2X^T Y + 2 \beta_0 X^T \mathbf{1} &= 0 \\ 2N \beta_0 - 2 \mathbf{1}^T (Y - X \beta) &= 0. \end{align*}
These equations can be solved for $\beta_0$ and $\beta$ by substitution as \begin{align*} \hat \beta_0 &= \frac{1}{N} \mathbf{1}^T (Y - X\beta) \\
\left(X^T X - \frac{1}{N}X^T \mathbf{1} \mathbf{1}^T X\right) \hat \beta &= X^T Y - \frac{1}{N} X^T \mathbf{1} \mathbf{1}^T Y
\end{align*}
The RHS can be written as \begin{align*}
X^T Y - \frac{1}{N} X^T \mathbf{1} \mathbf{1}^T Y &= t_1 N_1 \hat \mu_1 + t_2 N_2 \hat \mu_2 - \frac{1}{N} (N_1 \hat \mu_1 + N_2 \hat \mu_2)(t_1 N_1 + t_2 N_2) \\
&= \frac{N_1 N_2}{N} (t_1 - t_2) (\hat \mu_1 - \hat \mu_2)
\end{align*} where we use our relations for $X^T U_i$ and the fact that $\mathbf{1} = U_1 + U_2$.
Similarly, the bracketed term on the LHS of our expression for $\beta$ can be rewritten as \begin{align*}
X^T X = (N-2) \hat \Sigma + N_1 \hat \mu_1 \hat \mu_1^T + N_2 \hat \mu_2 \hat \mu_2^T,
\end{align*} and by substituting in the above and the definition of $\hat \Sigma_B$, we can write \begin{align*}
X^T X - \frac{1}{N}X^T \mathbf{1} \mathbf{1}^T X &= (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B
\end{align*} as required.
Putting this together, we obtain our required result, \[
\left( (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B \right) \hat \beta = \frac{N_1 N_2}{N} (t_1 - t_2)(\hat \mu_1 - \hat \mu_2),
\]
and then substituting $t_1 = -N/N_1, t_2 = N/N_2$, we obtain our required result, \[
\left( (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B \right) \hat \beta = N(\hat \mu_2 - \hat \mu_1)
\]
\item All that is required is to show that $\hat \Sigma_B \beta$ is in the direction of $(\hat \mu_2 - \hat \mu_1)$. This is clear from the fact that \[
\hat \Sigma_B \hat \beta = (\hat \mu_2 - \hat \mu_1)(\hat \mu_2 - \hat \mu_1)^T \hat \beta = \lambda (\hat \mu_2 - \hat \mu_1)
\] for some $\lambda \in \mathbb{R}$. Since $\hat \Sigma \hat \beta$ is a linear combination of terms in the direction of $(\hat \mu_2 - \hat \mu_1)$, we can write \[
\hat \beta \propto \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1)
\] as required.
\item Since our $t_1, t_2$ were arbitrary and distinct, the result follows.
\item From above, we can write \begin{align*}
\hat \beta_0 &= \frac{1}{N} \mathbf{1}^T (Y - X \hat \beta) \\
&= \frac{1}{N}(t_1 N_1 + t_2 N_2) - \frac{1}{N} \mathbf{1}{^T} X \hat \beta \\
&= -\frac{1}{N}(N_1 \hat \mu_1^T + N_2 \hat \mu_2^T) \hat \beta.
\end{align*}
We can then write our predicted value $\hat f(x) = \hat \beta_0 + \hat \beta^T x$ as \begin{align*}
\hat f(x) &= \frac{1}{N}\left( N x^T - N_1 \hat \mu_1^T - N_2 \hat \mu_2^T \right) \hat \beta \\
&= \frac{1}{N}\left( N x^T - N_1 \hat \mu_1^T - N_2 \hat \mu_2^T \right) \lambda \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1)
\end{align*} for some $\lambda \in \mathbb{R}$, and so our classification rule is $\hat f(x) > 0$, or equivalently, \begin{align*}
N x^T \lambda \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) > (N_1 \hat \mu_1^T + N_2 \hat \mu_2^T) \lambda \hat \Sigma^{-1}(\hat \mu_2 - \hat \mu_1) \\
x^T \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) > \frac{1}{N} \left( N_1 \hat \mu^T_1 + N_2 \hat \mu_2^T \right) \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1)
\end{align*} which is different to the LDA decision rule unless $N_1 = N_2$.
\end{enumerate}
\end{proof}
\begin{exer}
Suppose that we transform the original predictors $X$ to $\hat Y$ by taking the predicted values under linear regression. Show that LDA using $\hat Y$ is identical to using LDA in the original space.
\end{exer}
\begin{exer}
Consier the multilogit model with $K$ classes. Let $\beta$ be the $(p+1)(K-1)$-vector consisting of all the coefficients. Define a suitable enlarged version of the input vector $x$ to accommodate this vectorized coefficient matrix. Derive the Newton-Raphson algorithm for maximizing the multinomial log-likelihood, and describe how you would implement the algorithm.
\end{exer}
\begin{exer}
Consider a two-class regression problem with $x \in \mathbb{R}$. Characterise the MLE of the slope and intercept parameter if the sample $x_i$ for the two classes are separated by a point $x_0 \in \mathbb{R}$. Generalise this result to $x \in \mathbb{R}^p$ and more than two classes.
\end{exer}
\begin{exer}
Suppose that we have $N$ points $x_i \in \mathbb{R}^p$ in general position, with class labels $y_i \in \{-1, 1 \}$. Prove that the perceptron learning algorithm converges to a separating hyperplane in a finite number of steps.
\begin{enumerate}
\item Denote a hyperplane by $f(x) = \beta^T x^\star = 0$. Let $z_i = \frac{x_i^\star}{\| x_i^\star \|}$. Show that separability implies the existence of a $\beta_{\text{sep}}$ such that $y_i \beta_{\text{sep}}^T z_i \geq 1$ for all $i$.
\item Given a current $\beta_{\text{old}}$, the perceptron algorithm identifies a pint $z_i$ that is misclassified, and produces the update $\beta_{\text{new}} \leftarrow \beta_{\text{old}} + y_i z_i$. Show that
\[
\| \beta_{\text{new}} - \beta_{\text{sep}} \|^2 \leq \| \beta_{\text{old}} - \beta_{\text{sep}} \|^2 - 1
\] and hence that the algorithm converges to a separating hyperplane in no more than $\| \beta_{\text{start}} - \beta_{\text{sep}} \|^2$ steps.
\end{enumerate}
\end{exer}
\begin{proof}
Recall that the definition of separability implies the existence of a separating hyperplane - that is, a vector $\beta_\text{sep}$ such that $\text{sgn}\left( \beta^T_\text{sep} x^\star_i \right) = y_i$.
\begin{enumerate}
\item By assumption, there exists $\epsilon > 0$ and $\beta_\text{sep}$ such that \[
y_i \beta^T_\text{sep} z^\star_i \geq \epsilon
\] for all $i$. Then the hyperplane $\frac{1}{\epsilon} \beta_\text{sep}$ is a separating hyperplane that by linearity satisfies the constraint \[
y_i \beta^T_\text{sep} z^\star_i \geq 1.
\]
\item We have \begin{align*}
\| \beta_\text{new} - \beta_\text{sep} \|^2 &= \| \beta_\text{new} \|^2 + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \beta_\text{new} \\
&= \| \beta_\text{old} + y_i z_i \|^2 + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \left( \beta_\text{old} + y_i z_i \right) \\
&= \| \beta_\text{old} \|^2 + \| y_i z_i \|^2 + 2 y_i \beta_\text{old}^T z_i + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \beta_0 - 2 y_i \beta^T_\text{sep} z_i \\
&\leq \| \beta_\text{old} \|^2 + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \beta_\text{old} + 1 - 2 \\
&= \| \beta_\text{old} - \beta_\text{sep} \|^2 - 1.
\end{align*} Let $\beta_k, k = 0, 1, 2, \dots$ be the sequence of iterates formed by this procedure, with $\beta_0 = \beta_\text{start}$. Let $k^\star = \left\lceil \| \beta_\text{start} - \beta_\text{sep} \|^2 \right\rceil$.
Then by the above result, we must have $\| \beta_{k^\star} - \beta_\text{sep} \|^2 = 0$, and by properties of the norm we have that $\beta_{k^\star} = \beta_\text{sep}$, and so we have reached a separating hyperplane in no more than $k^\star$ steps.
\end{enumerate}
\end{proof}