From 8a5615b280d7916a282210ba4d0bc8294019182d Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 8 Oct 2019 09:47:40 -0700 Subject: [PATCH 1/2] [SPARK-29377][PYTHON][ML] Parity between scala ml tuning and python ml tuning --- python/pyspark/ml/tuning.py | 162 ++++++++++++++++++++++++++---------- 1 file changed, 118 insertions(+), 44 deletions(-) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index d80d6e8aaa342..4d4e42f1f6f35 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -122,7 +122,7 @@ def to_key_value_pairs(keys, values): return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)] -class ValidatorParams(HasSeed): +class _ValidatorParams(HasSeed): """ Common params for TrainValidationSplit and CrossValidator. """ @@ -133,36 +133,18 @@ class ValidatorParams(HasSeed): Params._dummy(), "evaluator", "evaluator used to select hyper-parameters that maximize the validator metric") - def setEstimator(self, value): - """ - Sets the value of :py:attr:`estimator`. - """ - return self._set(estimator=value) - def getEstimator(self): """ Gets the value of estimator or its default value. """ return self.getOrDefault(self.estimator) - def setEstimatorParamMaps(self, value): - """ - Sets the value of :py:attr:`estimatorParamMaps`. - """ - return self._set(estimatorParamMaps=value) - def getEstimatorParamMaps(self): """ Gets the value of estimatorParamMaps or its default value. """ return self.getOrDefault(self.estimatorParamMaps) - def setEvaluator(self, value): - """ - Sets the value of :py:attr:`evaluator`. - """ - return self._set(evaluator=value) - def getEvaluator(self): """ Gets the value of evaluator or its default value. @@ -199,7 +181,25 @@ def _to_java_impl(self): return java_estimator, java_epms, java_evaluator -class CrossValidator(Estimator, ValidatorParams, HasParallelism, HasCollectSubModels, +class _CrossValidatorParams(_ValidatorParams): + """ + Params for :py:class:`CrossValidator` and :py:class:`CrossValidatorModel`. + + .. versionadded:: 3.0.0 + """ + + numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation", + typeConverter=TypeConverters.toInt) + + @since("1.4.0") + def getNumFolds(self): + """ + Gets the value of numFolds or its default value. + """ + return self.getOrDefault(self.numFolds) + + +class CrossValidator(Estimator, _CrossValidatorParams, HasParallelism, HasCollectSubModels, MLReadable, MLWritable): """ @@ -226,6 +226,8 @@ class CrossValidator(Estimator, ValidatorParams, HasParallelism, HasCollectSubMo >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, ... parallelism=2) >>> cvModel = cv.fit(dataset) + >>> cvModel.getNumFolds() + 3 >>> cvModel.avgMetrics[0] 0.5 >>> evaluator.evaluate(cvModel.transform(dataset)) @@ -234,9 +236,6 @@ class CrossValidator(Estimator, ValidatorParams, HasParallelism, HasCollectSubMo .. versionadded:: 1.4.0 """ - numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation", - typeConverter=TypeConverters.toInt) - @keyword_only def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, seed=None, parallelism=1, collectSubModels=False): @@ -261,19 +260,30 @@ def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, num kwargs = self._input_kwargs return self._set(**kwargs) - @since("1.4.0") - def setNumFolds(self, value): + def setEstimator(self, value): """ - Sets the value of :py:attr:`numFolds`. + Sets the value of :py:attr:`estimator`. """ - return self._set(numFolds=value) + return self._set(estimator=value) + + def setEstimatorParamMaps(self, value): + """ + Sets the value of :py:attr:`estimatorParamMaps`. + """ + return self._set(estimatorParamMaps=value) + + def setEvaluator(self, value): + """ + Sets the value of :py:attr:`evaluator`. + """ + return self._set(evaluator=value) @since("1.4.0") - def getNumFolds(self): + def setNumFolds(self, value): """ - Gets the value of numFolds or its default value. + Sets the value of :py:attr:`numFolds`. """ - return self.getOrDefault(self.numFolds) + return self._set(numFolds=value) def _fit(self, dataset): est = self.getOrDefault(self.estimator) @@ -387,7 +397,7 @@ def _to_java(self): return _java_obj -class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): +class CrossValidatorModel(Model, _CrossValidatorParams, MLReadable, MLWritable): """ CrossValidatorModel contains the model with the highest average cross-validation @@ -407,6 +417,24 @@ def __init__(self, bestModel, avgMetrics=[], subModels=None): #: sub model list from cross validation self.subModels = subModels + def setEstimator(self, value): + """ + Sets the value of :py:attr:`estimator`. + """ + return self._set(estimator=value) + + def setEstimatorParamMaps(self, value): + """ + Sets the value of :py:attr:`estimatorParamMaps`. + """ + return self._set(estimatorParamMaps=value) + + def setEvaluator(self, value): + """ + Sets the value of :py:attr:`evaluator`. + """ + return self._set(evaluator=value) + def _transform(self, dataset): return self.bestModel.transform(dataset) @@ -486,8 +514,26 @@ def _to_java(self): return _java_obj -class TrainValidationSplit(Estimator, ValidatorParams, HasParallelism, HasCollectSubModels, - MLReadable, MLWritable): +class _TrainValidationSplitParams(_ValidatorParams): + """ + Params for :py:class:`TrainValidationSplit` and :py:class:`TrainValidationSplitModel`. + + .. versionadded:: 3.0.0 + """ + + trainRatio = Param(Params._dummy(), "trainRatio", "Param for ratio between train and\ + validation data. Must be between 0 and 1.", typeConverter=TypeConverters.toFloat) + + @since("2.0.0") + def getTrainRatio(self): + """ + Gets the value of trainRatio or its default value. + """ + return self.getOrDefault(self.trainRatio) + + +class TrainValidationSplit(Estimator, _TrainValidationSplitParams, HasParallelism, + HasCollectSubModels, MLReadable, MLWritable): """ Validation for hyper-parameter tuning. Randomly splits the input dataset into train and validation sets, and uses evaluation metric on the validation set to select the best model. @@ -509,15 +555,14 @@ class TrainValidationSplit(Estimator, ValidatorParams, HasParallelism, HasCollec >>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, ... parallelism=1, seed=42) >>> tvsModel = tvs.fit(dataset) + >>> tvsModel.getTrainRatio() + 0.75 >>> evaluator.evaluate(tvsModel.transform(dataset)) 0.833... .. versionadded:: 2.0.0 """ - trainRatio = Param(Params._dummy(), "trainRatio", "Param for ratio between train and\ - validation data. Must be between 0 and 1.", typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, parallelism=1, collectSubModels=False, seed=None): @@ -542,19 +587,30 @@ def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, tra kwargs = self._input_kwargs return self._set(**kwargs) - @since("2.0.0") - def setTrainRatio(self, value): + def setEstimator(self, value): """ - Sets the value of :py:attr:`trainRatio`. + Sets the value of :py:attr:`estimator`. """ - return self._set(trainRatio=value) + return self._set(estimator=value) + + def setEstimatorParamMaps(self, value): + """ + Sets the value of :py:attr:`estimatorParamMaps`. + """ + return self._set(estimatorParamMaps=value) + + def setEvaluator(self, value): + """ + Sets the value of :py:attr:`evaluator`. + """ + return self._set(evaluator=value) @since("2.0.0") - def getTrainRatio(self): + def setTrainRatio(self, value): """ - Gets the value of trainRatio or its default value. + Sets the value of :py:attr:`trainRatio`. """ - return self.getOrDefault(self.trainRatio) + return self._set(trainRatio=value) def _fit(self, dataset): est = self.getOrDefault(self.estimator) @@ -662,7 +718,7 @@ def _to_java(self): return _java_obj -class TrainValidationSplitModel(Model, ValidatorParams, MLReadable, MLWritable): +class TrainValidationSplitModel(Model, _TrainValidationSplitParams, MLReadable, MLWritable): """ Model from train validation split. @@ -678,6 +734,24 @@ def __init__(self, bestModel, validationMetrics=[], subModels=None): #: sub models from train validation split self.subModels = subModels + def setEstimator(self, value): + """ + Sets the value of :py:attr:`estimator`. + """ + return self._set(estimator=value) + + def setEstimatorParamMaps(self, value): + """ + Sets the value of :py:attr:`estimatorParamMaps`. + """ + return self._set(estimatorParamMaps=value) + + def setEvaluator(self, value): + """ + Sets the value of :py:attr:`evaluator`. + """ + return self._set(evaluator=value) + def _transform(self, dataset): return self.bestModel.transform(dataset) From 2a23e08654d9c8b36f5ee38d58186067e3cf64cd Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sun, 13 Oct 2019 10:16:36 -0700 Subject: [PATCH 2/2] add version info in getters/setters --- python/pyspark/ml/tuning.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 4d4e42f1f6f35..8fa0183e4683d 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -133,18 +133,21 @@ class _ValidatorParams(HasSeed): Params._dummy(), "evaluator", "evaluator used to select hyper-parameters that maximize the validator metric") + @since("2.0.0") def getEstimator(self): """ Gets the value of estimator or its default value. """ return self.getOrDefault(self.estimator) + @since("2.0.0") def getEstimatorParamMaps(self): """ Gets the value of estimatorParamMaps or its default value. """ return self.getOrDefault(self.estimatorParamMaps) + @since("2.0.0") def getEvaluator(self): """ Gets the value of evaluator or its default value. @@ -260,18 +263,21 @@ def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, num kwargs = self._input_kwargs return self._set(**kwargs) + @since("2.0.0") def setEstimator(self, value): """ Sets the value of :py:attr:`estimator`. """ return self._set(estimator=value) + @since("2.0.0") def setEstimatorParamMaps(self, value): """ Sets the value of :py:attr:`estimatorParamMaps`. """ return self._set(estimatorParamMaps=value) + @since("2.0.0") def setEvaluator(self, value): """ Sets the value of :py:attr:`evaluator`. @@ -417,18 +423,21 @@ def __init__(self, bestModel, avgMetrics=[], subModels=None): #: sub model list from cross validation self.subModels = subModels + @since("2.0.0") def setEstimator(self, value): """ Sets the value of :py:attr:`estimator`. """ return self._set(estimator=value) + @since("2.0.0") def setEstimatorParamMaps(self, value): """ Sets the value of :py:attr:`estimatorParamMaps`. """ return self._set(estimatorParamMaps=value) + @since("2.0.0") def setEvaluator(self, value): """ Sets the value of :py:attr:`evaluator`. @@ -587,18 +596,21 @@ def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, tra kwargs = self._input_kwargs return self._set(**kwargs) + @since("2.0.0") def setEstimator(self, value): """ Sets the value of :py:attr:`estimator`. """ return self._set(estimator=value) + @since("2.0.0") def setEstimatorParamMaps(self, value): """ Sets the value of :py:attr:`estimatorParamMaps`. """ return self._set(estimatorParamMaps=value) + @since("2.0.0") def setEvaluator(self, value): """ Sets the value of :py:attr:`evaluator`. @@ -734,18 +746,21 @@ def __init__(self, bestModel, validationMetrics=[], subModels=None): #: sub models from train validation split self.subModels = subModels + @since("2.0.0") def setEstimator(self, value): """ Sets the value of :py:attr:`estimator`. """ return self._set(estimator=value) + @since("2.0.0") def setEstimatorParamMaps(self, value): """ Sets the value of :py:attr:`estimatorParamMaps`. """ return self._set(estimatorParamMaps=value) + @since("2.0.0") def setEvaluator(self, value): """ Sets the value of :py:attr:`evaluator`.