-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKmeans.py
116 lines (101 loc) · 2.89 KB
/
Kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import math
import random
import numpy
import sys
#for
class Kmeans(object):
def __init__(self, distanceFunction):
self.distance = distanceFunction
def cluster(self, k, dataPoints):
centroids = random.sample(dataPoints, k)
# Initial cluster configuration does not matter.
clusters = [[] for c in range(k)]
clusters[0] = dataPoints
clusters = self.kmeansIteration(centroids, clusters, 30)
return clusters
@staticmethod
def flatten(clusters):
result = []
for cluster in clusters:
for point in cluster:
result.append(point)
return result
def kmeansIteration(self, centroids, clusters, iterationsLeft):
print "centroids",centroids
print "clusters",clusters
dataPoints = Kmeans.flatten(clusters)
print "dataPoints", dataPoints
k = len(clusters)
newClusters = [[] for c in range(k)]
print dataPoints
for dataPoint in dataPoints:
minimalDistance = sys.maxint
closestCentroid = None
for clusterId in range(k):
print "clusterId", clusterId
print "dataPoint", dataPoint
print "centroid", centroids[clusterId]
print "------------------------------------------------"
distance = self.distance(centroids[clusterId], dataPoint)
if distance < minimalDistance:
minimalDistance = distance
closestCentroid = clusterId
newClusters[closestCentroid].append(dataPoint)
print "newClusters", newClusters
for i in range(k):
if (Kmeans.areListsEqual(clusters[i], newClusters[i]) or
iterationsLeft == 0):
return newClusters
centroids = Kmeans.calculateNewCentroids(newClusters)
return self.kmeansIteration(centroids, newClusters, iterationsLeft-1)
@staticmethod
def areListsEqual(list1, list2):
print "list1", list1
print "list2", list2
if(not(len(list1) == len(list2))):
return False
for i in range(len(list1)):
if(not(list1[i] == list2[i])):
return False
return True
@staticmethod
def calculateNewCentroids(clusters):
print "old: ", clusters
k = len(clusters)
print "k:", k
centroids = [None] * k
print "centroids:", centroids
dimensions = len(Kmeans.flatten(clusters)[0])
print "dimensions:", dimensions
for clusterId in range(k):
tot = [0]* dimensions
for point in clusters[clusterId]:
tot = map(sum, zip(tot,point))
print "tot:", tot
centroids[clusterId] = [tot[i]/float(k) for i in range(dimensions)]
print "centroid:", clusterId, centroids[clusterId]
print "centroids: ", clusters
return centroids
testData = [
[1, 0, 0],
[2, 0, 0],
[3, 0, 0],
[4, 0, 0],
[3, 0, 0],
[2, 0, 0],
[0, 0, 2],
[0, 0, 3],
[0, 0, 4],
[0, 0, 5],
[0, 0, 1]]
class EuclideanDistance(object):
def __init__(self):
return
def __call__(self, p1, p2):
dimensions = len(p1)
return (math.sqrt(sum([(p1[i]-p2[i])**2 for i in range(dimensions)])))
kmeans = Kmeans(EuclideanDistance())
result = kmeans.cluster(2, testData)
print "*############################*"
print result[0]
print result[1]