Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-24439][ML][PYTHON] Add distanceMeasure to BisectingKMeans in PySpark #21557

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 24 additions & 11 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,8 +349,8 @@ def summary(self):


@inherit_doc
class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
JavaMLWritable, JavaMLReadable):
class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, HasMaxIter,
HasTol, HasSeed, JavaMLWritable, JavaMLReadable):
"""
K-means clustering with a k-means++ like initialization mode
(the k-means|| algorithm by Bahmani et al).
Expand Down Expand Up @@ -406,9 +406,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
typeConverter=TypeConverters.toString)
initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
"initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance measure. " +
"Supported options: 'euclidean' and 'cosine'.",
typeConverter=TypeConverters.toString)

@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
Expand Down Expand Up @@ -544,8 +541,8 @@ def summary(self):


@inherit_doc
class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasSeed,
JavaMLWritable, JavaMLReadable):
class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol,
HasMaxIter, HasSeed, JavaMLWritable, JavaMLReadable):
"""
A bisecting k-means algorithm based on the paper "A comparison of document clustering
techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark.
Expand Down Expand Up @@ -585,6 +582,8 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
>>> bkm2 = BisectingKMeans.load(bkm_path)
>>> bkm2.getK()
2
>>> bkm2.getDistanceMeasure()
'euclidean'
>>> model_path = temp_path + "/bkm_model"
>>> model.save(model_path)
>>> model2 = BisectingKMeansModel.load(model_path)
Expand All @@ -607,10 +606,10 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte

@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20,
seed=None, k=4, minDivisibleClusterSize=1.0):
seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"):
"""
__init__(self, featuresCol="features", predictionCol="prediction", maxIter=20, \
seed=None, k=4, minDivisibleClusterSize=1.0)
seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean")
"""
super(BisectingKMeans, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.BisectingKMeans",
Expand All @@ -622,10 +621,10 @@ def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=2
@keyword_only
@since("2.0.0")
def setParams(self, featuresCol="features", predictionCol="prediction", maxIter=20,
seed=None, k=4, minDivisibleClusterSize=1.0):
seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"):
"""
setParams(self, featuresCol="features", predictionCol="prediction", maxIter=20, \
seed=None, k=4, minDivisibleClusterSize=1.0)
seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean")
Sets params for BisectingKMeans.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know we already have setDistanceMeasure and getDistanceMeasure methods from the shared param, but can you also add them here so we can use the since decorator? (same as KMeans)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BryanCutler Thank you very much for your review. I will make the change.

"""
kwargs = self._input_kwargs
Expand Down Expand Up @@ -659,6 +658,20 @@ def getMinDivisibleClusterSize(self):
"""
return self.getOrDefault(self.minDivisibleClusterSize)

@since("2.4.0")
def setDistanceMeasure(self, value):
    """
    Sets the value of :py:attr:`distanceMeasure`.

    :param value: the new value for the `distanceMeasure` param.
    :return: whatever `self._set` returns for this update.
    """
    # Build the keyword mapping first, then forward it in a single call.
    updates = {"distanceMeasure": value}
    return self._set(**updates)

@since("2.4.0")
def getDistanceMeasure(self):
    """
    Gets the value of :py:attr:`distanceMeasure` or its default value.
    """
    # Look the param up on the instance, then resolve it through getOrDefault.
    param = self.distanceMeasure
    return self.getOrDefault(param)

def _create_model(self, java_model):
    """
    Wraps `java_model` in a :py:class:`BisectingKMeansModel` and returns it.
    """
    model = BisectingKMeansModel(java_model)
    return model

Expand Down
4 changes: 3 additions & 1 deletion python/pyspark/ml/param/_shared_params_code_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,9 @@ def get$Name(self):
"fitting. If set to true, then all sub-models will be available. Warning: For large " +
"models, collecting all sub-models can cause OOMs on the Spark driver.",
"False", "TypeConverters.toBoolean"),
("loss", "the loss function to be optimized.", None, "TypeConverters.toString")]
("loss", "the loss function to be optimized.", None, "TypeConverters.toString"),
("distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.",
"'euclidean'", "TypeConverters.toString")]

code = []
for name, doc, defaultValueStr, typeConverter in shared:
Expand Down
24 changes: 24 additions & 0 deletions python/pyspark/ml/param/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,3 +790,27 @@ def getCacheNodeIds(self):
"""
return self.getOrDefault(self.cacheNodeIds)


class HasDistanceMeasure(Params):
    """
    Mixin for param distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'.
    """

    # NOTE(review): this class looks like output of _shared_params_code_gen.py
    # (which lists a matching "distanceMeasure" entry) -- confirm before
    # hand-editing, since regeneration may overwrite manual changes.
    distanceMeasure = Param(
        Params._dummy(),
        "distanceMeasure",
        "the distance measure. Supported options: 'euclidean' and 'cosine'.",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasDistanceMeasure, self).__init__()
        # 'euclidean' is registered as the default for this param.
        self._setDefault(distanceMeasure='euclidean')

    def setDistanceMeasure(self, value):
        """
        Sets the value of :py:attr:`distanceMeasure`.
        """
        updates = {"distanceMeasure": value}
        return self._set(**updates)

    def getDistanceMeasure(self):
        """
        Gets the value of distanceMeasure or its default value.
        """
        param = self.distanceMeasure
        return self.getOrDefault(param)