Skip to content

Commit

Permalink
[SPARK-23344][PYTHON][ML] Add distanceMeasure param to KMeans
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

SPARK-22119 introduced a new parameter for KMeans, i.e. `distanceMeasure`. This PR also adds it to the Python interface.

## How was this patch tested?

Added unit tests.

Author: Marco Gaido <[email protected]>

Closes #20520 from mgaido91/SPARK-23344.
  • Loading branch information
mgaido91 authored and srowen committed Feb 10, 2018
1 parent 97a224a commit 0783876
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 5 deletions.
32 changes: 27 additions & 5 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,17 +403,23 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
typeConverter=TypeConverters.toString)
initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
"initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
# Param selecting the distance measure; per its help text the supported
# options are 'euclidean' and 'cosine'. Values go through TypeConverters.toString.
distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance measure. " +
"Supported options: 'euclidean' and 'cosine'.",
typeConverter=TypeConverters.toString)

@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None,
distanceMeasure="euclidean"):
"""
__init__(self, featuresCol="features", predictionCol="prediction", k=2, \
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \
distanceMeasure="euclidean")
"""
super(KMeans, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20)
self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20,
distanceMeasure="euclidean")
kwargs = self._input_kwargs
self.setParams(**kwargs)

Expand All @@ -423,10 +429,12 @@ def _create_model(self, java_model):
@keyword_only
@since("1.5.0")
def setParams(self, featuresCol="features", predictionCol="prediction", k=2,
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None,
distanceMeasure="euclidean"):
"""
setParams(self, featuresCol="features", predictionCol="prediction", k=2, \
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \
distanceMeasure="euclidean")
Sets params for KMeans.
"""
Expand Down Expand Up @@ -475,6 +483,20 @@ def getInitSteps(self):
"""
return self.getOrDefault(self.initSteps)

@since("2.4.0")
def setDistanceMeasure(self, value):
"""
Sets the value of :py:attr:`distanceMeasure`.

The value is stored through ``self._set`` and the result of that call
is returned to the caller.
"""
return self._set(distanceMeasure=value)

@since("2.4.0")
def getDistanceMeasure(self):
"""
Gets the value of :py:attr:`distanceMeasure`, looked up via ``getOrDefault``.
"""
return self.getOrDefault(self.distanceMeasure)


class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
Expand Down
18 changes: 18 additions & 0 deletions python/pyspark/ml/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,9 @@ def test_kmeans_param(self):
self.assertEqual(algo.getK(), 10)
algo.setInitSteps(10)
self.assertEqual(algo.getInitSteps(), 10)
self.assertEqual(algo.getDistanceMeasure(), "euclidean")
algo.setDistanceMeasure("cosine")
self.assertEqual(algo.getDistanceMeasure(), "cosine")

def test_hasseed(self):
noSeedSpecd = TestParams()
Expand Down Expand Up @@ -1620,6 +1623,21 @@ def test_kmeans_summary(self):
self.assertEqual(s.k, 2)


class KMeansTests(SparkSessionTestCase):
    """Tests for KMeans fitted with a non-default distance measure."""

    def test_kmeans_cosine_distance(self):
        """
        Fit KMeans with ``distanceMeasure="cosine"`` on six 2-d vectors and
        check that each consecutive pair of rows gets the same prediction.
        """
        # Rows are laid out in pairs (0, 1), (2, 3), (4, 5); the vectors in a
        # pair point in (nearly) the same direction but differ in magnitude.
        data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),),
                (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),),
                (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)]
        df = self.spark.createDataFrame(data, ["features"])
        kmeans = KMeans(k=3, seed=1, distanceMeasure="cosine")
        model = kmeans.fit(df)
        result = model.transform(df).collect()
        # assertEqual reports both predictions on failure, unlike
        # assertTrue(a == b), which only reports "False is not true".
        for first, second in ((0, 1), (2, 3), (4, 5)):
            self.assertEqual(result[first].prediction, result[second].prediction)


class OneVsRestTests(SparkSessionTestCase):

def test_copy(self):
Expand Down

0 comments on commit 0783876

Please sign in to comment.