Skip to content

Commit

Permalink
Python Kmeans - setEpsilon, setInitializationSteps, k and computeCost…
Browse files Browse the repository at this point in the history
… added.
  • Loading branch information
FlytxtRnD committed May 5, 2015
1 parent c5790a2 commit 21eb84c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
1 change: 1 addition & 0 deletions examples/src/main/python/mllib/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,5 @@ def parseVector(line):
k = int(sys.argv[2])
model = KMeans.train(data, k)
print("Final centers: " + str(model.clusterCenters))
print("Total Cost: " + str(model.computeCost(data)))
sc.stop()
Original file line number Diff line number Diff line change
Expand Up @@ -291,12 +291,16 @@ private[python] class PythonMLLibAPI extends Serializable {
maxIterations: Int,
runs: Int,
initializationMode: String,
seed: java.lang.Long): KMeansModel = {
seed: java.lang.Long,
initializationSteps: Int,
epsilon: Double): KMeansModel = {
val kMeansAlg = new KMeans()
.setK(k)
.setMaxIterations(maxIterations)
.setRuns(runs)
.setInitializationMode(initializationMode)
.setInitializationSteps(initializationSteps)
.setEpsilon(epsilon)

if (seed != null) kMeansAlg.setSeed(seed)

Expand All @@ -307,6 +311,15 @@ private[python] class PythonMLLibAPI extends Serializable {
}
}

/**
 * Java stub for Python mllib KMeansModel.computeCost().
 *
 * Rebuilds a KMeansModel from the cluster centers handed over by the Python
 * side and returns the cost that model computes on `data`.
 *
 * @param data points to evaluate the model on
 * @param centers cluster centers used to construct the model
 * @return the value of KMeansModel.computeCost for `data`
 */
def computeCostKmeansModel(
    data: JavaRDD[Vector],
    centers: java.util.ArrayList[Vector]): Double = {
  val model = new KMeansModel(centers)
  model.computeCost(data)
}

/**
* Java stub for Python mllib GaussianMixture.run()
* Returns a list containing weights, mean and covariance of each mixture component.
Expand Down
27 changes: 23 additions & 4 deletions python/pyspark/mllib/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,25 @@ class KMeansModel(Saveable, Loader):
>>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
>>> model = KMeans.train(
... sc.parallelize(data), 2, maxIterations=10, runs=30, initializationMode="random")
... sc.parallelize(data), 2, maxIterations=10, runs=30, initializationMode="random",
... initializationSteps=5, epsilon=1e-4, seed=None)
>>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
True
>>> model.predict(array([8.0, 9.0])) == model.predict(array([9.0, 8.0]))
True
>>> model.k
2
>>> model.computeCost(sc.parallelize(data))
2.0000000000000004
>>> model = KMeans.train(sc.parallelize(data), 2)
>>> sparse_data = [
... SparseVector(3, {1: 1.0}),
... SparseVector(3, {1: 1.1}),
... SparseVector(3, {2: 1.0}),
... SparseVector(3, {2: 1.1})
... ]
>>> model = KMeans.train(sc.parallelize(sparse_data), 2, initializationMode="k-means||")
>>> model = KMeans.train(sc.parallelize(sparse_data), 2, initializationMode="k-means||",
... initializationSteps=5, epsilon=1e-4, seed=None)
>>> model.predict(array([0., 1., 0.])) == model.predict(array([0, 1.1, 0.]))
True
>>> model.predict(array([0., 0., 1.])) == model.predict(array([0, 0, 1.1]))
Expand Down Expand Up @@ -83,6 +89,11 @@ def clusterCenters(self):
"""Get the cluster centers, represented as a list of NumPy arrays."""
return self.centers

@property
def k(self):
    """Return the total number of clusters, i.e. how many centers the model holds."""
    centers = self.centers
    return len(centers)

def predict(self, x):
"""Find the cluster to which x belongs in this model."""
best = 0
Expand All @@ -95,6 +106,13 @@ def predict(self, x):
best_distance = distance
return best

def computeCost(self, rdd):
    """
    Return the K-means cost (sum of squared distances of points to
    their nearest center) for this model on the given data.

    :param rdd: RDD of points; every element is passed through
        _convert_to_vector before being handed to the JVM.
    :return: the cost reported by the JVM-side
        "computeCostKmeansModel" stub.
    """
    # Materialize the centers as a real list. On Python 3, map() yields a
    # lazy iterator rather than a list, while the JVM stub takes a list of
    # vectors; save() builds its centers with the same comprehension.
    centers = [_convert_to_vector(c) for c in self.centers]
    return callMLlibFunc("computeCostKmeansModel", rdd.map(_convert_to_vector), centers)

def save(self, sc, path):
java_centers = _py2java(sc, [_convert_to_vector(c) for c in self.centers])
java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
Expand All @@ -109,10 +127,11 @@ def load(cls, sc, path):
class KMeans(object):

@classmethod
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||", seed=None):
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
seed=None, initializationSteps=5, epsilon=1e-4):
"""Train a k-means clustering model."""
model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), k, maxIterations,
runs, initializationMode, seed)
runs, initializationMode, seed, initializationSteps, epsilon)
centers = callJavaFunc(rdd.context, model.clusterCenters)
return KMeansModel([c.toArray() for c in centers])

Expand Down

0 comments on commit 21eb84c

Please sign in to comment.