-
Notifications
You must be signed in to change notification settings - Fork 86
/
Copy pathpca.jl
285 lines (220 loc) · 8.5 KB
/
pca.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# Principal Component Analysis
"""
Linear Principal Component Analysis
"""
struct PCA{T<:Real} <: LinearDimensionalityReduction
mean::AbstractVector{T} # sample mean: of length d (mean can be empty, which indicates zero mean)
proj::AbstractMatrix{T} # projection matrix: of size d x p
prinvars::AbstractVector{T} # principal variances: of length p
tprinvar::T # total principal variance, i.e. sum(prinvars)
tvar::T # total input variance
end
## constructor
function PCA(mean::AbstractVector{T}, proj::AbstractMatrix{T}, pvars::AbstractVector{T}, tvar::T) where {T<:Real}
d, p = size(proj)
(isempty(mean) || length(mean) == d) ||
throw(DimensionMismatch("Dimensions of mean and projection matrix are inconsistent."))
length(pvars) == p ||
throw(DimensionMismatch("Dimensions of projection matrix and principal variables are inconsistent."))
tpvar = sum(pvars)
tpvar <= tvar || isapprox(tpvar,tvar) || throw(ArgumentError("principal variance cannot exceed total variance."))
PCA(mean, proj, pvars, tpvar, tvar)
end
## properties
"""
size(M)
Returns a tuple with the dimensions of input (the dimension of the observation space)
and output (the dimension of the principal subspace).
"""
size(M::PCA) = size(M.proj)
"""
mean(M::PCA)
Returns the mean vector (of length `d`).
"""
mean(M::PCA) = fullmean(size(M.proj,1), M.mean)
"""
projection(M::PCA)
Returns the projection matrix (of size `(d, p)`). Each column of the projection matrix corresponds to a principal component.
The principal components are arranged in descending order of the corresponding variances.
"""
projection(M::PCA) = M.proj
"""
eigvecs(M::PCA)
Get the eigenvalues of the PCA model `M`.
"""
eigvecs(M::PCA) = projection(M)
"""
principalvars(M::PCA)
Returns the variances of principal components.
"""
principalvars(M::PCA) = M.prinvars
principalvar(M::PCA, i::Int) = M.prinvars[i]
"""
eigvals(M::PCA)
Get the eigenvalues of the PCA model `M`.
"""
eigvals(M::PCA) = principalvars(M)
"""
tprincipalvar(M::PCA)
Returns the total variance of principal components, which is equal to `sum(principalvars(M))`.
"""
tprincipalvar(M::PCA) = M.tprinvar
"""
tresidualvar(M::PCA)
Returns the total residual variance.
"""
tresidualvar(M::PCA) = M.tvar - M.tprinvar
"""
var(M::PCA)
Returns the total observation variance, which is equal to `tprincipalvar(M) + tresidualvar(M)`.
"""
var(M::PCA) = M.tvar
"""
r2(M::PCA)
principalratio(M::PCA)
Returns the ratio of variance preserved in the principal subspace, which is equal to `tprincipalvar(M) / var(M)`.
"""
r2(M::PCA) = M.tprinvar / M.tvar
const principalratio = r2
"""
loadings(M::PCA)
Returns model loadings, i.e. the weights for each original variable when calculating the principal component.
"""
loadings(M::PCA) = sqrt.(principalvars(M))' .* projection(M)
## use
"""
predict(M::PCA, x::AbstractVecOrMat{<:Real})
Given a PCA model `M`, retur transform observations `x` into principal components space, as
\$\\mathbf{y} = \\mathbf{P}^T (\\mathbf{x} - \\boldsymbol{\\mu})\$
Here, `x` can be either a vector of length `d` or a matrix where each column is an observation,
and `\\mathbf{P}` is the projection matrix.
"""
predict(M::PCA, x::AbstractVecOrMat{T}) where {T<:Real} = transpose(M.proj) * centralize(x, M.mean)
"""
reconstruct(M::PCA, y::AbstractVecOrMat{<:Real})
Given a PCA model `M`, returns a (approximately) reconstructed observations
from principal components space, as
\$\\tilde{\\mathbf{x}} = \\mathbf{P} \\mathbf{y} + \\boldsymbol{\\mu}\$
Here, `y` can be either a vector of length `p` or a matrix where each column
gives the principal components for an observation, and \$\\mathbf{P}\$ is the projection matrix.
"""
reconstruct(M::PCA, y::AbstractVecOrMat{T}) where {T<:Real} = decentralize(M.proj * y, M.mean)
## show & dump
function show(io::IO, M::PCA)
idim, odim = size(M)
print(io, "PCA(indim = $idim, outdim = $odim, principalratio = $(r2(M)))")
end
#### PCA Training
## auxiliary
const default_pca_pratio = 0.99
function check_pcaparams(d::Int, mean::AbstractVector, md::Int, pr::Real)
isempty(mean) || length(mean) == d ||
throw(DimensionMismatch("Incorrect length of mean."))
md >= 1 || error("`maxoutdim` parameter must be a positive integer.")
0.0 < pr <= 1.0 || throw(ArgumentError("principal ratio must be a positive real value ≤ 1.0."))
end
function choose_pcadim(v::AbstractVector{T}, ord::Vector{Int}, vsum::T, md::Int,
pr::Real) where {T<:Real}
md = min(length(v), md)
k = 1
a = v[ord[1]]
thres = vsum * convert(T, pr)
while k < md && a < thres
a += v[ord[k += 1]]
end
return k
end
## core algorithms
"""
pcacov(C, mean; ...)
Compute and return a PCA model based on eigenvalue decomposition of a given covariance matrix `C`.
**Parameters:**
- `C`: The covariance matrix of the samples.
- `mean`: The mean vector of original samples, which can be a vector of length `d`,
or an empty vector `Float64[]` indicating a zero mean.
*Note:* This function accepts two keyword arguments: `maxoutdim` and `pratio`.
"""
function pcacov(C::AbstractMatrix{T}, mean::AbstractVector{T};
maxoutdim::Int=size(C,1),
pratio::Real=default_pca_pratio) where {T<:Real}
check_pcaparams(size(C,1), mean, maxoutdim, pratio)
Eg = eigen(Symmetric(C))
ev = Eg.values
ord = sortperm(ev; rev=true)
vsum = sum(ev)
k = choose_pcadim(ev, ord, vsum, maxoutdim, pratio)
v, P = extract_kv(Eg, ord, k)
PCA(mean, P, v, vsum)
end
"""
pcasvd(Z, mean, tw; ...)
Compute and return a PCA model based on singular value decomposition of a centralized sample matrix `Z`.
**Parameters:**
- `Z`: a matrix of centralized samples.
- `mean`: The mean vector of the **original** samples, which can be a vector of length `d`,
or an empty vector `Float64[]` indicating a zero mean.
- `n`: a number of samples.
*Note:* This function accepts two keyword arguments: `maxoutdim` and `pratio`.
"""
function pcasvd(Z::AbstractMatrix{T}, mean::AbstractVector{T}, n::Real;
maxoutdim::Int=min(size(Z)...),
pratio::Real=default_pca_pratio) where {T<:Real}
check_pcaparams(size(Z,1), mean, maxoutdim, pratio)
Svd = svd(Z)
v = Svd.S::Vector{T}
U = Svd.U::Matrix{T}
for i = 1:length(v)
@inbounds v[i] = abs2(v[i]) / n
end
ord = sortperm(v; rev=true)
vsum = sum(v)
k = choose_pcadim(v, ord, vsum, maxoutdim, pratio)
si = ord[1:k]
PCA(mean, U[:,si], v[si], vsum)
end
## interface functions
"""
fit(PCA, X; ...)
Perform PCA over the data given in a matrix `X`. Each column of `X` is an **observation**.
**Keyword arguments**
- `method`: The choice of methods:
- `:auto`: use `:cov` when `d < n` or `:svd` otherwise (*default*).
- `:cov`: based on covariance matrix decomposition.
- `:svd`: based on SVD of the input data.
- `maxoutdim`: The output dimension, i.e. dimension of the transformed space (*min(d, nc-1)*)
- `pratio`: The ratio of variances preserved in the principal subspace (*0.99*)
- `mean`: The mean vector, which can be either of
- `0`: the input data has already been centralized
- `nothing`: this function will compute the mean (*default*)
- a pre-computed mean vector
**Notes:**
- The output dimension `p` depends on both `maxoutdim` and `pratio`, as follows. Suppose
the first `k` principal components preserve at least `pratio` of the total variance, while the
first `k-1` preserves less than `pratio`, then the actual output dimension will be \$\\min(k, maxoutdim)\$.
- This function calls [`pcacov`](@ref) or [`pcasvd`](@ref) internally, depending on the choice of method.
"""
function fit(::Type{PCA}, X::AbstractMatrix{T};
method::Symbol=:auto,
maxoutdim::Int=size(X,1),
pratio::Real=default_pca_pratio,
mean=nothing) where {T<:Real}
@assert !SparseArrays.issparse(X) "Use Kernel PCA for sparse arrays"
d, n = size(X)
# choose method
if method == :auto
method = d < n ? :cov : :svd
end
# process mean
mv = preprocess_mean(X, mean)
# delegate to core
if method == :cov
C = covm(X, isempty(mv) ? 0 : mv, 2)
M = pcacov(C, mv; maxoutdim=maxoutdim, pratio=pratio)
elseif method == :svd
Z = centralize(X, mv)
M = pcasvd(Z, mv, n; maxoutdim=maxoutdim, pratio=pratio)
else
throw(ArgumentError("Invalid method name $(method)"))
end
return M::PCA
end