# -*- coding: utf-8 -*-
import numpy as np


def flip_signs(A, B):
    """
    Utility function for resolving the sign ambiguity in SVD:
    returns B with the signs of its entries aligned to those of A.
    http://stats.stackexchange.com/q/34396/115202
    """
    signs = np.sign(A) * np.sign(B)
    return A, B * signs
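
# A minimal sketch of flip_signs in action (the arrays below are made up
# for illustration): given two arrays that agree up to sign, it returns B
# with its signs aligned to A, so the pair can be compared directly with
# np.testing.assert_allclose.
#
#   >>> A = np.array([[1.0, -2.0], [3.0, 4.0]])
#   >>> B = np.array([[-1.0, -2.0], [-3.0, 4.0]])
#   >>> flip_signs(A, B)[1]
#   array([[ 1., -2.],
#          [ 3.,  4.]])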


class PCA(object):
    """
    Principal Component Analysis

    Parameters
    ----------
    n_components : int
        Number of components to keep.
        If n_components is not set, all components are kept.

    Attributes
    ----------
    explained_variance : array, [n_components]
        The fraction of the total variance explained by each of the
        selected components.
    explained_variance_cumsum : array, [n_components]
        Cumulative sum of the explained variance for every component.
    total_var : float
        Total fraction of variance explained by the selected components.
        Equal to `sum(explained_variance)`.
    mean_ : array, [n_features]
        Per-feature empirical mean, estimated from the training set.
        Equal to `X.mean(axis=0)`.
    n_components_ : int
        The estimated number of components.

    Example usage
    -------------
    X = np.random.rand(14, 88)                 # 2D ndarray
    model = PCA(2)
    eig_projection = model.eigen(X)            # eigendecomposition method
    svd_projection = model.svd(X)              # SVD method
    svd_projection2D = model(X)                # callable class; defaults to SVD
    eig_projection2D = model(X, method='eig')  # eigenvector method
    svd_projection2D[:, 0]                     # first component (max variance)
    svd_projection2D[:, -1]                    # last kept component (least variance)
    """
    def __init__(self, n_components=None):
        self.n_components = n_components

    def __call__(self, X, method=None):
        # dispatch on the method name; anything other than 'eig' uses SVD
        if method == 'eig':
            return self.eigen(X)
        return self.svd(X)

    @staticmethod
    def scale(X):
        """
        Standardize features by removing the mean and scaling to unit
        variance. Note that features with zero variance cause a division
        by zero here.
        """
        X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
        return X_std
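
    # Quick sanity sketch for scale (illustrative only): after scaling,
    # every column has mean ~0 and standard deviation ~1:
    #
    #   >>> Z = PCA.scale(np.random.rand(100, 3))
    #   >>> np.allclose(Z.mean(axis=0), 0) and np.allclose(Z.std(axis=0), 1)
    #   True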

    def eigen(self, X):
        """
        PCA via eigendecomposition: compute the eigenvalues and
        eigenvectors of the correlation matrix, then project X onto
        the new feature space.
        """
        assert X.ndim == 2
        n_samples, n_features = X.shape
        # Estimate if n_components not specified
        self.n_components_ = self.n_components or n_features
        self.mean_ = np.mean(X, axis=0)
        k = self.n_components_
        # Standardizing the data
        X = self.scale(X)
        # Eigendecomposition - computing eigenvectors and eigenvalues.
        # The covariance matrix of the standardized data equals the
        # correlation matrix of the raw (or standardized) data.
        cov_mat = np.corrcoef(X, rowvar=False)
        print('Covariance matrix: \n%s' % cov_mat)
        # eigh is appropriate here because the matrix is symmetric
        eig_vals, eig_vecs = np.linalg.eigh(cov_mat)
        # Sorting eigenpairs wrt. eigenvalues, in decreasing order
        idx = eig_vals.argsort()[::-1]
        eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]
        # the k largest eigenvalues
        print("Eigenvalues:\n", eig_vals[:k])
        # a matrix of eigenvectors (each column is an eigenvector)
        print("Eigenvectors:\n", eig_vecs[:, :k])
# Explained Variance
self.explained_variance = (eig_vals / np.sum(eig_vals))[:k]
self.explained_variance_cumsum = np.cumsum(self.explained_variance)
self.total_var = np.sum(self.explained_variance)
print('Total variance explained:\n', self.total_var)
print('Explained variance:\n', self.explained_variance)
print('Explained variance cumsum:\n', self.explained_variance_cumsum)
# Projection matrix and dimensionality reduction
W = eig_vecs[:, 0:k]
assert W.shape == (n_features, self.n_components_)
# projections of X on the principal axes are called principal components
PC_k = X.dot(W)
assert PC_k.shape == (n_samples, self.n_components_)
# expose variables as attributes for unit testing
self.X = X
self.cov_mat = cov_mat
self.eig_vals = eig_vals
self.eig_vecs = eig_vecs
return PC_k

    def svd(self, X):
        """
        PCA via Singular Value Decomposition of the standardized data.
        """
        assert X.ndim == 2
        n_samples, n_features = X.shape
        # Estimate if n_components not specified
        self.n_components_ = self.n_components or n_features
        self.mean_ = np.mean(X, axis=0)
        k = self.n_components_
        # Standardizing the data
        X = self.scale(X)
        # we now perform singular value decomposition of X
        # "economy size" (or "thin") SVD
        U, s, Vt = np.linalg.svd(X, full_matrices=False)
        V = Vt.T
        S = np.diag(s)
        # 1) columns of V are principal directions/axes.
        # np.testing.assert_allclose(*flip_signs(V, principal_axes))
        # 2) columns of US are principal components
        # np.testing.assert_allclose(*flip_signs(U.dot(S), principal_components), rtol=1e-04)
        # 3) singular values are related to the eigenvalues of the covariance
        #    matrix: eig_vals = (s ** 2) / (n_samples - 1)
        eig_vals = (s ** 2) / (n_samples - 1)
# Explained Variance
self.explained_variance = (eig_vals / np.sum(eig_vals))[:k]
self.explained_variance_cumsum = np.cumsum(self.explained_variance)
self.total_var = np.sum(self.explained_variance)
# dimensionality reduction
US_k = U[:, 0:k].dot(S[0:k, 0:k])
assert US_k.shape == (n_samples, self.n_components_)
# expose variables as attributes for unit testing
self.X = X
self.U = U
self.s = s
self.Vt = Vt
self.V = V
self.S = S
return US_k
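

if __name__ == "__main__":
    # A minimal usage sketch on synthetic data (this demo block is an
    # illustrative addition, not part of the module's API): the SVD and
    # eigendecomposition routes should produce the same projection up to
    # sign, which flip_signs resolves before the comparison.
    np.random.seed(0)
    X = np.random.rand(14, 88)
    model = PCA(2)
    svd_projection = model.svd(X)
    eig_projection = model.eigen(X)
    np.testing.assert_allclose(
        *flip_signs(svd_projection, eig_projection), rtol=1e-5, atol=1e-8
    )
    print("SVD and eigendecomposition projections agree up to sign.")
    print("Explained variance ratios:", model.explained_variance)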