-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathxmeans.py
executable file
·229 lines (189 loc) · 7.27 KB
/
xmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python
"""Class that implements X-means."""
import argparse
import numpy as np
import logging
import time
import pylab as plt
import scipy.cluster.vq as vq
from scipy.spatial import distance
class XMeans:
def __init__(self, X, init_K=2, plot=False, seed=None):
self.X = X
self.init_K = init_K
self.plot = plot
if seed != None:
np.random.seed(seed)
def estimate_K_xmeans(self, th=0.2, maxK = 10):
"""Estimates K running X-means algorithm (Pelleg & Moore, 2000)."""
# Run initial K-means
means, labels = self.run_kmeans(self.X, self.init_K)
# Run X-means algorithm
stop = False
curr_K = self.init_K
while not stop:
stop = True
final_means = []
for k in xrange(curr_K):
# Find the data that corresponds to the k-th cluster
D = self.get_clustered_data(self.X, labels, k)
if len(D) == 0 or D.shape[0] == 1:
continue
# Whiten and find whitened mean
stdD = np.std(D, axis=0)
#D = vq.whiten(D)
D /= stdD # Same as line above
mean = D.mean(axis=0)
# Cluster this subspace by half (K=2)
half_means, half_labels = self.run_kmeans(D, K=2)
# Compute BICs
bic1 = self.compute_bic(D, [mean], K=1,
labels=np.zeros(D.shape[0]),
R=D.shape[0])
bic2 = self.compute_bic(D, half_means, K=2,
labels=half_labels, R=D.shape[0])
# Split or not
max_bic = np.max([np.abs(bic1), np.abs(bic2)])
norm_bic1 = bic1 / max_bic
norm_bic2 = bic2 / max_bic
diff_bic = np.abs(norm_bic1 - norm_bic2)
# Split!
print "diff_bic", diff_bic
if diff_bic > th:
final_means.append(half_means[0] * stdD)
final_means.append(half_means[1] * stdD)
curr_K += 1
stop = False
# Don't split
else:
final_means.append(mean * stdD)
final_means = np.asarray(final_means)
print "Estimated K: ", curr_K
if self.plot:
plt.scatter(self.X[:, 0], self.X[:, 1])
plt.scatter(final_means[:, 0], final_means[:, 1], color="y")
plt.show()
if curr_K >= maxK or self.X.shape[-1] != final_means.shape[-1]:
stop = True
else:
labels, dist = vq.vq(self.X, final_means)
return curr_K
def estimate_K_knee(self, th=.015, maxK=12):
"""Estimates the K using K-means and BIC, by sweeping various K and
choosing the optimal BIC."""
# Sweep K-means
if self.X.shape[0] < maxK:
maxK = self.X.shape[0]
if maxK < 2:
maxK = 2
K = np.arange(1, maxK)
bics = []
for k in K:
means, labels = self.run_kmeans(self.X, k)
bic = self.compute_bic(self.X, means, labels, K=k,
R=self.X.shape[0])
bics.append(bic)
diff_bics = np.diff(bics)
finalK = K[-1]
if len(bics) == 1:
finalK = 2
else:
# Normalize
bics = np.asarray(bics)
bics -= bics.min()
#bics /= bics.max()
diff_bics -= diff_bics.min()
#diff_bics /= diff_bics.max()
#print bics, diff_bics
# Find optimum K
for i in xrange(len(K[:-1])):
#if bics[i] > diff_bics[i]:
if diff_bics[i] < th and K[i] != 1:
finalK = K[i]
break
logging.info("Estimated Unique Number of Segments: %d" % finalK)
if self.plot:
plt.subplot(2, 1, 1)
plt.plot(K, bics, label="BIC")
plt.plot(K[:-1], diff_bics, label="BIC diff")
plt.legend(loc=2)
plt.subplot(2, 1, 2)
plt.scatter(self.X[:, 0], self.X[:, 1])
plt.show()
return finalK
def get_clustered_data(self, X, labels, label_index):
"""Returns the data with a specific label_index, using the previously
learned labels."""
D = X[np.argwhere(labels == label_index)]
return D.reshape((D.shape[0], D.shape[-1]))
def run_kmeans(self, X, K):
"""Runs k-means and returns the labels assigned to the data."""
wX = vq.whiten(X)
means, dist = vq.kmeans(wX, K, iter=100)
labels, dist = vq.vq(wX, means)
return means, labels
def compute_bic(self, D, means, labels, K, R):
"""Computes the Bayesian Information Criterion."""
D = vq.whiten(D)
Rn = D.shape[0]
M = D.shape[1]
if R == K:
return 1
# Maximum likelihood estimate (MLE)
mle_var = 0
for k in xrange(len(means)):
X = D[np.argwhere(labels == k)]
X = X.reshape((X.shape[0], X.shape[-1]))
for x in X:
mle_var += distance.euclidean(x, means[k])
#print x, means[k], mle_var
mle_var /= (float(R - K))
# Log-likelihood of the data
l_D = - Rn/2. * np.log(2*np.pi) - (Rn * M)/2. * np.log(mle_var) - \
(Rn - K) / 2. + Rn * np.log(Rn) - Rn * np.log(R)
# Params of BIC
p = (K-1) + M * K + mle_var
#print "BIC:", l_D, p, R, K
# Return the bic
return l_D - p/2. * np.log(R)
@classmethod
def generate_2d_data(self, N=100, K=5):
"""Generates N*K 2D data points with K means and N data points
for each mean."""
# Seed the random
np.random.seed(seed=int(time.time()))
# Amount of spread of the centroids
spread = 30
# Generate random data
X = np.empty((0, 2))
for i in xrange(K):
mean = np.array([np.random.random()*spread,
np.random.random()*spread])
x = np.random.normal(0.0, scale=1.0, size=(N, 2)) + mean
X = np.append(X, x, axis=0)
return X
def test_kmeans(K=5):
"""Test k-means with the synthetic data."""
X = XMeans.generate_2d_data(K=4)
wX = vq.whiten(X)
dic, dist = vq.kmeans(wX, K, iter=100)
plt.scatter(wX[:, 0], wX[:, 1])
plt.scatter(dic[:, 0], dic[:, 1], color="m")
plt.show()
def main(args):
#test_kmeans(6)
X = XMeans.generate_2d_data(K=args.k)
xmeans = XMeans(X, init_K=2, plot=args.plot)
est_K = xmeans.estimate_K_xmeans()
est_K_knee = xmeans.estimate_K_knee()
print "Estimated x-means K:", est_K
print "Estimated Knee Point Detection K:", est_K_knee
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Runs x-means")
parser.add_argument("k",
metavar="k", type=int,
help="Number of clusters to estimate.")
parser.add_argument("-p", action="store_true", default=False,
dest="plot", help="Plot the results")
main(parser.parse_args())