\section{Rosenblatt's Perceptron (1957)}
\subsection{Motivation}
\begin{frame}
\frametitle{Motivation}
\begin{itemize}
\item We want to compute a linear decision boundary. \\[.5cm]
\item We assume that the classes are linearly separable. \\[.5cm]
\item Goal: compute a linear separating hyperplane that \\
minimizes the sum of the distances of misclassified feature vectors \\
to the decision boundary.
\end{itemize}
\end{frame}
\subsection{Objective Function}
\begin{frame}
\frametitle{Objective Function}
Assume the following:
\begin{itemize}
\item Class labels are $y=\pm 1$.
\item The decision boundary is a linear function:
\begin{displaymath}
y^* = \mbox{sgn}(\vec \alpha^T\vec x + \alpha_0).
\end{displaymath}
\pause
\item Parameters $\alpha_0$ and $\vec \alpha$ are chosen according to the optimization problem \\[.2cm]
\begin{center}
\tikz[baseline]{
\node[fill=bl1!100,anchor=base,rounded corners=3pt] (d1) {
\color{bl3}
$\displaystyle \mbox{minimize} \quad \bigg\{ D(\alpha_0, \vec \alpha)= -\sum_{\vec x_i\in {\cal M}} y_i \cdot (\vec \alpha^T\vec x_i + \alpha_0) \bigg\}$
};
}\\[.2cm]
\end{center}
where $\cal M$ denotes the set of misclassified feature vectors (a short numerical sketch follows on the next slide).
\end{itemize}
\end{frame}
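\begin{frame}[fragile]
\frametitle{Objective Function: A Numerical Sketch}
A minimal NumPy sketch (added here for illustration, not part of the original lecture code) that evaluates $D(\alpha_0, \vec\alpha)$; the names \texttt{X}, \texttt{y}, \texttt{alpha0}, \texttt{alpha} are illustrative assumptions.
\begin{verbatim}
import numpy as np

def perceptron_criterion(alpha0, alpha, X, y):
    """D(alpha0, alpha), summed over the misclassified samples.

    X: (m, d) feature vectors, y: (m,) labels in {-1, +1}."""
    scores = y * (X @ alpha + alpha0)      # y_i * (alpha^T x_i + alpha_0)
    misclassified = scores <= 0            # the set M
    return -np.sum(scores[misclassified])  # always >= 0
\end{verbatim}
\end{frame}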
\begin{frame}
\frametitle{Objective Function \cont}
\begin{itemize}
\item The elements of the sum in the objective function depend on \\
the set of misclassified feature vectors $\cal M$. \\[.5cm] \pause
\item In each iteration step the cardinality of $\cal M$ might change. \\[.5cm] \pause
\item The cardinality of $\cal M$ is a discrete variable. \\[.5cm] \pause
\item \structure{Competing variables:} the continuous parameters of the linear decision boundary and the discrete cardinality of $\cal M$.
\end{itemize}
\end{frame}
\subsection{Minimization of Objective Function}
\begin{frame}
\frametitle{Minimization of Objective Function}
Remember the objective function $D(\alpha_0, \vec \alpha)$:
\begin{displaymath}
\mbox{minimize} \quad \quad D(\alpha_0, \vec \alpha)= -\sum_{\vec x_i\in {\cal M}} y_i \cdot (\vec \alpha^T\vec x_i + \alpha_0)
\end{displaymath}
\pause
The gradient of the objective function is: \pause
\begin{eqnarray*}
\frac{\partial}{\partial \alpha_0} \ D(\alpha_0, \vec \alpha)&=& -\sum_{\vec x_i\in {\cal M}} y_i \\ \pause
\frac{\partial}{\partial \vec\alpha} \ D(\alpha_0, \vec \alpha)&=& -\sum_{\vec x_i\in {\cal M}} y_i\cdot \vec x_i \\
\end{eqnarray*}
\end{frame}
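\begin{frame}
\frametitle{Minimization of Objective Function: Worked Example}
A small worked example of the gradient (the numbers are chosen here for illustration and are not part of the original slides):
suppose ${\cal M}$ contains exactly $\vec x_1 = (1, 2)^T$ with $y_1 = +1$ and $\vec x_2 = (3, 0)^T$ with $y_2 = -1$. Then
\begin{displaymath}
\frac{\partial}{\partial \alpha_0} \ D(\alpha_0, \vec \alpha) = -(y_1 + y_2) = 0,
\qquad
\frac{\partial}{\partial \vec\alpha} \ D(\alpha_0, \vec \alpha)
= -\left( {1 \choose 2} - {3 \choose 0} \right) = {2 \choose -2}.
\end{displaymath}
\end{frame}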
\begin{frame}
\frametitle{Minimization of Objective Function \cont}
We take an update step immediately after visiting a misclassified observation,
i.\,e.\ a stochastic gradient step that uses only the terms of the currently visited sample $(\vec x_i, y_i)$.
The update rule in the $(k+1)$-st iteration step is:
\begin{displaymath}
{ \alpha_0^{(k+1)} \choose \vec \alpha^{(k+1)}} = \pause { \alpha_0^{(k)} \choose \vec \alpha^{(k)}} + \lambda {y_i \choose y_i\cdot \vec x_i}
\end{displaymath}
Here $\lambda$ is the learning rate, which can be set to $1$ without loss of generality.
\end{frame}
\begin{frame}
\frametitle{Minimization of Objective Function \cont}
\begin{algorithmic}
\STATE \structure{Input:} training data: $S = \{ (\vec x_1, y_1), (\vec x_2, y_2), (\vec x_3, y_3), \dots, (\vec x_m, y_m) \}$ \pause
\STATE initialize $k=0$, $\alpha^{(0)}_0=0$ and $\vec \alpha^{(0)}= \vec 0$
\REPEAT
\STATE select a pair $(\vec x_i, y_i)$ from the training set. \pause
\IF {$y_i\cdot (\vec x_i^T \vec\alpha^{(k)} + \alpha_0^{(k)})\leq 0$}
\STATE \
\STATE $\displaystyle{ \alpha_0^{(k+1)} \choose \vec \alpha^{(k+1)}} = { \alpha_0^{(k)} \choose \vec \alpha^{(k)}} + {y_i \choose y_i\cdot \vec x_i}$
\STATE \
\STATE $k\leftarrow k+1$
\ENDIF
\pause
\UNTIL{$y_i\cdot (\vec x_i^T \vec\alpha^{(k)} + \alpha_0^{(k)})> 0 \quad \mbox {for all} \quad i$}
\STATE \structure{Output:} $\alpha_0^{(k)}$ and $\vec \alpha^{(k)}$
\end{algorithmic}
\end{frame}
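\begin{frame}[fragile]
\frametitle{Minimization of Objective Function: Code Sketch}
A minimal NumPy sketch of the learning loop above (written for these notes; the function and variable names are illustrative, and the learning rate is fixed to $1$):
\begin{verbatim}
import numpy as np

def train_perceptron(X, y, max_epochs=1000):
    """X: (m, d) feature vectors, y: (m,) labels in {-1, +1}.
    Assumes linearly separable data; otherwise the loop stops
    after max_epochs passes without having converged."""
    m, d = X.shape
    alpha0, alpha = 0.0, np.zeros(d)
    for _ in range(max_epochs):
        mistakes = 0
        for xi, yi in zip(X, y):
            if yi * (xi @ alpha + alpha0) <= 0:  # misclassified
                alpha0 += yi                     # update rule
                alpha += yi * xi
                mistakes += 1
        if mistakes == 0:                        # all samples correct
            break
    return alpha0, alpha
\end{verbatim}
\end{frame}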
\subsection{Remarks on Perceptron Learning}
\begin{frame}
\frametitle{Remarks on Perceptron Learning}
\begin{itemize}
\item The update rule is extremely simple. \\[.3cm]
\item Nothing happens if we classify all $\vec x_i$ correctly using the given linear decision boundary. \\[.3cm]
\item The parameter $\vec \alpha$ of the decision boundary is a linear combination of feature vectors. \\[.3cm] \pause
\item With the zero initialization above, the decision function thus is:
\begin{displaymath}
F(\vec x) = \left (\sum_{i\in \cal E} y_i\cdot \vec x_i \right)^T \vec x + \sum_{i\in \cal E} y_i \quad
= \quad \sum_{i\in \cal E} y_i\cdot \langle \vec x_i, \vec x\rangle + \sum_{i\in \cal E} y_i
\end{displaymath}
where $\cal E$ is the list of indices that required an update \\
(indices may appear more than once).
\end{itemize}
\end{frame}
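\begin{frame}[fragile]
\frametitle{Remarks on Perceptron Learning: Dual Form Sketch}
A hedged sketch of the dual representation above (it assumes the zero initialization $\alpha_0^{(0)}=0$, $\vec\alpha^{(0)}=\vec 0$; the index list \texttt{E} is what the training loop would record, one entry per update):
\begin{verbatim}
import numpy as np

def decision_function_dual(x, X, y, E):
    """F(x) = sum_{i in E} y_i <x_i, x> + sum_{i in E} y_i.

    E may contain repeated indices (one entry per update)."""
    yE = y[E]                          # labels of the updating samples
    return yE @ (X[E] @ x) + yE.sum()  # inner products plus bias
\end{verbatim}
Taking the sign of this value is equivalent to classifying with the primal parameters $\vec\alpha = \sum_{i\in\cal E} y_i\,\vec x_i$ and $\alpha_0 = \sum_{i\in\cal E} y_i$.
\end{frame}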
\begin{frame}
\frametitle{Remarks on Perceptron Learning \cont}
\begin{itemize}
\item The final linear decision boundary depends on the initialization, \\
i.\,e.\ $\alpha_0^{(0)}$ and $\vec \alpha^{(0)}$. \\[.5cm]
\item The number of iterations can be rather large. \\[.5cm]
\item If the data are not linearly separable, the proposed learning algorithm will not converge. The algorithm ends up in hard-to-detect cycles.
\end{itemize}
\end{frame}
\subsection{Convergence of Learning Algorithm}
\begin{frame}
\frametitle{Convergence of Learning Algorithm}
\begin{theorem}[Convergence Theorem of Rosenblatt and Novikoff]
Assume that for all $i=1,2, \dots, m$
\begin{displaymath}
y_i(\vec x_i^T\vec \alpha^* +\alpha^*_0)\geq \rho
\end{displaymath}
where $\rho>0$ and $\|\vec \alpha^*\|=1$.
Let $M= \max_i \|\vec x_i\|_2$. \\[.3cm]
The perceptron learning algorithm converges to a separating linear decision boundary after $k$ update steps, where $k$ is bounded by
\begin{displaymath}
k \leq \frac{(\alpha_0^{*2}+1)(1+M^2)}{\rho^2}.
\end{displaymath}
\end{theorem}
\end{frame}
\begin{frame}
\frametitle{Convergence of Learning Algorithm \cont}
Let us look at the estimated parameters after $k$ iterations and \\
how they change from one iteration to the next:
\begin{eqnarray*}
{\alpha_0^{(k)} \choose \vec \alpha^{(k)}}^T{\alpha_0^* \choose \vec \alpha^*}
&=& \pause \left( { \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}} + {y_i \choose y_i\cdot \vec x_i}\right)^T {\alpha_0^* \choose \vec \alpha^*}\\ \pause
&=& { \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}} ^T{\alpha_0^* \choose \vec \alpha^*} + {y_i \choose y_i\cdot \vec x_i}^T {\alpha_0^* \choose \vec \alpha^*} \\ \pause
&\geq& { \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}} ^T{\alpha_0^* \choose \vec \alpha^*} + \rho \\ \pause
&\geq& k\rho
\end{eqnarray*}
\structure{Conclusion:} Since the parameters are initialized at zero, induction over the $k$ update steps yields the lower bound $k\rho$: \\
the more iterations (i.\,e.\ misclassifications) we have, the more the estimated parameters align with the optimal ones.
\end{frame}
\begin{frame}
\frametitle{Convergence of Learning Algorithm \cont}
Now we apply the Cauchy-Schwarz inequality for inner products:
\begin{eqnarray*}
{\alpha_0^{(k)} \choose \vec \alpha^{(k)}}^T{\alpha_0^* \choose \vec \alpha^*}
&\leq& \left\| {\alpha_0^{(k)} \choose \vec \alpha^{(k)}}\right\|_2\cdot \left\| {\alpha_0^* \choose \vec \alpha^*}\right\|_2 \\[.5cm] \pause
&=&\left \| {\alpha_0^{(k)} \choose \vec \alpha^{(k)}}\right\|_2\cdot \sqrt{\alpha_0^{*2}+1}
\end{eqnarray*}
\end{frame}
\begin{frame}
\frametitle{Convergence of Learning Algorithm \cont}
The squared norm of the parameter vector estimated in the $k$-th iteration step is:
\begin{eqnarray*}
& & \left \| {\alpha_0^{(k)} \choose \vec \alpha^{(k)}}\right\|_2^2 =
\left\| { \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}} + {y_i \choose y_i\cdot \vec x_i}\right\|_2^2 \\[.5cm] \pause
& & \vspace*{-2cm} = { \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}} ^T{ \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}}
+ 2 { \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}}^T{y_i \choose y_i\cdot \vec x_i}
+ {y_i \choose y_i\cdot \vec x_i}^T{y_i \choose y_i\cdot \vec x_i}
\end{eqnarray*}
\end{frame}
\begin{frame}
\frametitle{Convergence of Learning Algorithm \cont}
We only perform the $k$-th update if the selected sample is misclassified by the previous parameters $(\alpha_0^{(k-1)}, \vec\alpha^{(k-1)})$. \\
A misclassification implies:
\begin{displaymath}
{ \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}}^T{y_i \choose y_i\cdot \vec x_i}
= y_i\cdot (\vec x_i^T \vec\alpha^{(k-1)} + \alpha_0^{(k-1)})\quad \leq \quad 0
\end{displaymath}
\pause
And thus we get
\begin{displaymath}
\left \| {\alpha_0^{(k)} \choose \vec \alpha^{(k)}}\right\|_2^2
\leq { \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}} ^T{ \alpha_0^{(k-1)} \choose \vec \alpha^{(k-1)}}
+ {y_i \choose y_i\cdot \vec x_i}^T{y_i \choose y_i\cdot \vec x_i}
\leq k(1+M^2),
\end{displaymath}
since $y_i^2\,(1+\|\vec x_i\|_2^2)\leq 1+M^2$ and, with the zero initialization, induction over the $k$ update steps applies.
\end{frame}
\begin{frame}
\frametitle{Convergence of Learning Algorithm \cont}
Wrap-up:
\begin{displaymath}
k\rho
\leq {\alpha_0^{(k)} \choose \vec \alpha^{(k)}}^T{\alpha_0^* \choose \vec \alpha^*}
\leq \left \| {\alpha_0^{(k)} \choose \vec \alpha^{(k)}}\right\|_2\cdot \sqrt{\alpha_0^{*2}+1}
\end{displaymath}
\pause
Combining this with the norm bound $k(1+M^2)$ from the previous slide:
\begin{displaymath}
k\rho\quad
\leq \quad \left \| {\alpha_0^{(k)} \choose \vec \alpha^{(k)}}\right\|_2\cdot \sqrt{\alpha_0^{*2}+1}\quad \leq \quad \sqrt{k(1+M^2)(\alpha_0^{*2}+1)}
\end{displaymath}
\pause
Squaring both sides and dividing by $k\rho^2$ shows:
\begin{displaymath}
k \quad \leq\quad \frac{(\alpha_0^{*2}+1)(1+M^2)}{\rho^2}
\end{displaymath}
\end{frame}
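\begin{frame}[fragile]
\frametitle{Convergence of Learning Algorithm: Empirical Check}
A small NumPy experiment (the data and the reference separator are invented here for illustration): it builds a separable toy set, computes $\rho$ and $M$ for a known unit-norm separator, counts the perceptron updates $k$, and compares $k$ with the bound.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
X = rng.uniform(-1.0, 1.0, size=(400, 2))
s = X @ np.array([1.0, 1.0]) - 0.2
X, s = X[np.abs(s) > 0.1], s[np.abs(s) > 0.1]  # enforce a clear margin
y = np.where(s > 0, 1, -1)

a_star = np.array([1.0, 1.0]) / np.sqrt(2.0)   # ||a_star||_2 = 1
a0_star = -0.2 / np.sqrt(2.0)
rho = np.min(y * (X @ a_star + a0_star))       # margin rho > 0
M = np.max(np.linalg.norm(X, axis=1))          # radius of the data

alpha0, alpha, k = 0.0, np.zeros(2), 0         # perceptron, zero init
converged = False
while not converged:
    converged = True
    for xi, yi in zip(X, y):
        if yi * (xi @ alpha + alpha0) <= 0:    # misclassified -> update
            alpha0, alpha, k = alpha0 + yi, alpha + yi * xi, k + 1
            converged = False

print("updates k =", k, "  bound =",
      (a0_star**2 + 1) * (1 + M**2) / rho**2)
\end{verbatim}
\end{frame}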
\subsection{Lessons Learned}
\begin{frame}
\frametitle{Lessons Learned}
\begin{itemize}
\item The objective function changes in each iteration step. \\[.5cm]
\item The optimization problem couples continuous parameters with the discrete set of misclassified vectors. \\[.5cm]
\item Very simple learning rule. \\[.5cm]
\item \vorsicht \structure{Very important:} Number of iterations does \structure{not} depend on the \\
dimension of the feature vectors.
\end{itemize}
\end{frame}
\input{nextTime.tex}
\subsection{Further Readings}
\begin{frame}
\frametitle{Further Readings}
\begin{itemize}
\item Brian D. Ripley: \\
\structure{Pattern Recognition and Neural Networks}, \\
Cambridge University Press, Cambridge, 1996.\\[.3cm]
\item T. Hastie, R. Tibshirani, and J. Friedman: \\
\structure{The Elements of Statistical Learning --}\\
\structure{ Data Mining, Inference, and Prediction},\\
2nd edition, Springer, New York, 2009.
\end{itemize}
\end{frame}
\subsection{Comprehension Questions}
\begin{frame}
\frametitle{Comprehension Questions}
\begin{itemize}
\item What is Rosenblatt's perceptron? \\[1cm]
\item What is the objective function for Rosenblatt's perceptron? \\[1cm]
\item Why is the optimization of the objective function nonlinear? \\[1cm]
\item When and how does Rosenblatt's perceptron algorithm converge?
\end{itemize}
\end{frame}