From 30c4b2477ee1ddd61f85a901111aa7ce44d81823 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 16 Oct 2019 09:05:00 -0700 Subject: [PATCH] [SPARK-29381][FOLLOWUP][PYTHON][ML] Add 'private' _XXXParams classes for classification & regression --- python/pyspark/ml/classification.py | 501 +++++++++++++++------------- python/pyspark/ml/regression.py | 216 ++++++------ 2 files changed, 392 insertions(+), 325 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index c08705829c2f1..c1db26af5982d 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -149,10 +149,23 @@ def setThresholds(self, value): return self._set(thresholds=value) +class _LinearSVCParams(_JavaClassifierParams, HasRegParam, HasMaxIter, HasFitIntercept, HasTol, + HasStandardization, HasWeightCol, HasAggregationDepth, HasThreshold): + """ + Params for :py:class:`LinearSVC` and :py:class:`LinearSVCModel`. + + .. versionadded:: 3.0.0 + """ + + threshold = Param(Params._dummy(), "threshold", + "The threshold in binary classification applied to the linear model" + " prediction. This threshold can be any real number, where Inf will make" + " all predictions 0.0 and -Inf will make all predictions 1.0.", + typeConverter=TypeConverters.toFloat) + + @inherit_doc -class LinearSVC(JavaClassifier, HasMaxIter, HasRegParam, HasTol, - HasFitIntercept, HasStandardization, HasWeightCol, HasAggregationDepth, - HasThreshold, JavaMLWritable, JavaMLReadable): +class LinearSVC(JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadable): """ `Linear SVM Classifier `_ @@ -170,6 +183,10 @@ class LinearSVC(JavaClassifier, HasMaxIter, HasRegParam, HasTol, LinearSVC... >>> model.getPredictionCol() 'newPrediction' + >>> model.setThreshold(0.5) + LinearSVC... 
+ >>> model.getThreshold() + 0.5 >>> model.coefficients DenseVector([0.0, -0.2792, -0.1833]) >>> model.intercept @@ -202,12 +219,6 @@ class LinearSVC(JavaClassifier, HasMaxIter, HasRegParam, HasTol, .. versionadded:: 2.2.0 """ - threshold = Param(Params._dummy(), "threshold", - "The threshold in binary classification applied to the linear model" - " prediction. This threshold can be any real number, where Inf will make" - " all predictions 0.0 and -Inf will make all predictions 1.0.", - typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", @@ -247,7 +258,7 @@ def _create_model(self, java_model): return LinearSVCModel(java_model) -class LinearSVCModel(JavaClassificationModel, JavaMLWritable, JavaMLReadable): +class LinearSVCModel(JavaClassificationModel, _LinearSVCParams, JavaMLWritable, JavaMLReadable): """ Model fitted by LinearSVC. @@ -271,10 +282,167 @@ def intercept(self): return self._call_java("intercept") +class _LogisticRegressionParams(_JavaProbabilisticClassifierParams, HasRegParam, + HasElasticNetParam, HasMaxIter, HasFitIntercept, HasTol, + HasStandardization, HasWeightCol, HasAggregationDepth, + HasThreshold): + """ + Params for :py:class:`LogisticRegression` and :py:class:`LogisticRegressionModel`. + + .. versionadded:: 3.0.0 + """ + + threshold = Param(Params._dummy(), "threshold", + "Threshold in binary classification prediction, in range [0, 1]." + + " If threshold and thresholds are both set, they must match." + + "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", + typeConverter=TypeConverters.toFloat) + + family = Param(Params._dummy(), "family", + "The name of family which is a description of the label distribution to " + + "be used in the model. 
Supported options: auto, binomial, multinomial", + typeConverter=TypeConverters.toString) + + lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients", + "The lower bounds on coefficients if fitting under bound " + "constrained optimization. The bound matrix must be " + "compatible with the shape " + "(1, number of features) for binomial regression, or " + "(number of classes, number of features) " + "for multinomial regression.", + typeConverter=TypeConverters.toMatrix) + + upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients", + "The upper bounds on coefficients if fitting under bound " + "constrained optimization. The bound matrix must be " + "compatible with the shape " + "(1, number of features) for binomial regression, or " + "(number of classes, number of features) " + "for multinomial regression.", + typeConverter=TypeConverters.toMatrix) + + lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts", + "The lower bounds on intercepts if fitting under bound " + "constrained optimization. The bounds vector size must be " + "equal with 1 for binomial regression, or the number of " + "classes for multinomial regression.", + typeConverter=TypeConverters.toVector) + + upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts", + "The upper bounds on intercepts if fitting under bound " + "constrained optimization. The bound vector size must be " + "equal with 1 for binomial regression, or the number of " + "classes for multinomial regression.", + typeConverter=TypeConverters.toVector) + + @since("1.4.0") + def setThreshold(self, value): + """ + Sets the value of :py:attr:`threshold`. + Clears value of :py:attr:`thresholds` if it has been set. + """ + self._set(threshold=value) + self._clear(self.thresholds) + return self + + @since("1.4.0") + def getThreshold(self): + """ + Get threshold for binary classification. 
+ + If :py:attr:`thresholds` is set with length 2 (i.e., binary classification), + this returns the equivalent threshold: + :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. + Otherwise, returns :py:attr:`threshold` if set or its default value if unset. + """ + self._checkThresholdConsistency() + if self.isSet(self.thresholds): + ts = self.getOrDefault(self.thresholds) + if len(ts) != 2: + raise ValueError("Logistic Regression getThreshold only applies to" + + " binary classification, but thresholds has length != 2." + + " thresholds: " + ",".join(map(str, ts))) + return 1.0/(1.0 + ts[0]/ts[1]) + else: + return self.getOrDefault(self.threshold) + + @since("1.5.0") + def setThresholds(self, value): + """ + Sets the value of :py:attr:`thresholds`. + Clears value of :py:attr:`threshold` if it has been set. + """ + self._set(thresholds=value) + self._clear(self.threshold) + return self + + @since("1.5.0") + def getThresholds(self): + """ + If :py:attr:`thresholds` is set, return its value. + Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary + classification: (1-threshold, threshold). + If neither are set, throw an error. + """ + self._checkThresholdConsistency() + if not self.isSet(self.thresholds) and self.isSet(self.threshold): + t = self.getOrDefault(self.threshold) + return [1.0-t, t] + else: + return self.getOrDefault(self.thresholds) + + def _checkThresholdConsistency(self): + if self.isSet(self.threshold) and self.isSet(self.thresholds): + ts = self.getOrDefault(self.thresholds) + if len(ts) != 2: + raise ValueError("Logistic Regression getThreshold only applies to" + + " binary classification, but thresholds has length != 2." 
+ + " thresholds: {0}".format(str(ts))) + t = 1.0/(1.0 + ts[0]/ts[1]) + t2 = self.getOrDefault(self.threshold) + if abs(t2 - t) >= 1E-5: + raise ValueError("Logistic Regression getThreshold found inconsistent values for" + + " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) + + @since("2.1.0") + def getFamily(self): + """ + Gets the value of :py:attr:`family` or its default value. + """ + return self.getOrDefault(self.family) + + @since("2.3.0") + def getLowerBoundsOnCoefficients(self): + """ + Gets the value of :py:attr:`lowerBoundsOnCoefficients` + """ + return self.getOrDefault(self.lowerBoundsOnCoefficients) + + @since("2.3.0") + def getUpperBoundsOnCoefficients(self): + """ + Gets the value of :py:attr:`upperBoundsOnCoefficients` + """ + return self.getOrDefault(self.upperBoundsOnCoefficients) + + @since("2.3.0") + def getLowerBoundsOnIntercepts(self): + """ + Gets the value of :py:attr:`lowerBoundsOnIntercepts` + """ + return self.getOrDefault(self.lowerBoundsOnIntercepts) + + @since("2.3.0") + def getUpperBoundsOnIntercepts(self): + """ + Gets the value of :py:attr:`upperBoundsOnIntercepts` + """ + return self.getOrDefault(self.upperBoundsOnIntercepts) + + @inherit_doc -class LogisticRegression(JavaProbabilisticClassifier, HasMaxIter, HasRegParam, HasTol, - HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, - HasWeightCol, HasAggregationDepth, JavaMLWritable, JavaMLReadable): +class LogisticRegression(JavaProbabilisticClassifier, _LogisticRegressionParams, JavaMLWritable, + JavaMLReadable): """ Logistic regression. This class supports multinomial logistic (softmax) and binomial logistic regression. @@ -294,6 +462,10 @@ class LogisticRegression(JavaProbabilisticClassifier, HasMaxIter, HasRegParam, H LogisticRegressionModel... >>> blorModel.getProbabilityCol() 'newProbability' + >>> blorModel.setThreshold(0.1) + LogisticRegressionModel... 
+ >>> blorModel.getThreshold() + 0.1 >>> blorModel.coefficients DenseVector([-1.080..., -0.646...]) >>> blorModel.intercept @@ -341,49 +513,6 @@ class LogisticRegression(JavaProbabilisticClassifier, HasMaxIter, HasRegParam, H .. versionadded:: 1.3.0 """ - threshold = Param(Params._dummy(), "threshold", - "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match." + - "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", - typeConverter=TypeConverters.toFloat) - - family = Param(Params._dummy(), "family", - "The name of family which is a description of the label distribution to " + - "be used in the model. Supported options: auto, binomial, multinomial", - typeConverter=TypeConverters.toString) - - lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients", - "The lower bounds on coefficients if fitting under bound " - "constrained optimization. The bound matrix must be " - "compatible with the shape " - "(1, number of features) for binomial regression, or " - "(number of classes, number of features) " - "for multinomial regression.", - typeConverter=TypeConverters.toMatrix) - - upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients", - "The upper bounds on coefficients if fitting under bound " - "constrained optimization. The bound matrix must be " - "compatible with the shape " - "(1, number of features) for binomial regression, or " - "(number of classes, number of features) " - "for multinomial regression.", - typeConverter=TypeConverters.toMatrix) - - lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts", - "The lower bounds on intercepts if fitting under bound " - "constrained optimization. 
The bounds vector size must be" - "equal with 1 for binomial regression, or the number of" - "lasses for multinomial regression.", - typeConverter=TypeConverters.toVector) - - upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts", - "The upper bounds on intercepts if fitting under bound " - "constrained optimization. The bound vector size must be " - "equal with 1 for binomial regression, or the number of " - "classes for multinomial regression.", - typeConverter=TypeConverters.toVector) - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, @@ -439,75 +568,6 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return LogisticRegressionModel(java_model) - @since("1.4.0") - def setThreshold(self, value): - """ - Sets the value of :py:attr:`threshold`. - Clears value of :py:attr:`thresholds` if it has been set. - """ - self._set(threshold=value) - self._clear(self.thresholds) - return self - - @since("1.4.0") - def getThreshold(self): - """ - Get threshold for binary classification. - - If :py:attr:`thresholds` is set with length 2 (i.e., binary classification), - this returns the equivalent threshold: - :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. - Otherwise, returns :py:attr:`threshold` if set or its default value if unset. - """ - self._checkThresholdConsistency() - if self.isSet(self.thresholds): - ts = self.getOrDefault(self.thresholds) - if len(ts) != 2: - raise ValueError("Logistic Regression getThreshold only applies to" + - " binary classification, but thresholds has length != 2." + - " thresholds: " + ",".join(ts)) - return 1.0/(1.0 + ts[0]/ts[1]) - else: - return self.getOrDefault(self.threshold) - - @since("1.5.0") - def setThresholds(self, value): - """ - Sets the value of :py:attr:`thresholds`. 
- Clears value of :py:attr:`threshold` if it has been set. - """ - self._set(thresholds=value) - self._clear(self.threshold) - return self - - @since("1.5.0") - def getThresholds(self): - """ - If :py:attr:`thresholds` is set, return its value. - Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary - classification: (1-threshold, threshold). - If neither are set, throw an error. - """ - self._checkThresholdConsistency() - if not self.isSet(self.thresholds) and self.isSet(self.threshold): - t = self.getOrDefault(self.threshold) - return [1.0-t, t] - else: - return self.getOrDefault(self.thresholds) - - def _checkThresholdConsistency(self): - if self.isSet(self.threshold) and self.isSet(self.thresholds): - ts = self.getOrDefault(self.thresholds) - if len(ts) != 2: - raise ValueError("Logistic Regression getThreshold only applies to" + - " binary classification, but thresholds has length != 2." + - " thresholds: {0}".format(str(ts))) - t = 1.0/(1.0 + ts[0]/ts[1]) - t2 = self.getOrDefault(self.threshold) - if abs(t2 - t) >= 1E-5: - raise ValueError("Logistic Regression getThreshold found inconsistent values for" + - " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) - @since("2.1.0") def setFamily(self, value): """ @@ -515,13 +575,6 @@ def setFamily(self, value): """ return self._set(family=value) - @since("2.1.0") - def getFamily(self): - """ - Gets the value of :py:attr:`family` or its default value. 
- """ - return self.getOrDefault(self.family) - @since("2.3.0") def setLowerBoundsOnCoefficients(self, value): """ @@ -529,13 +582,6 @@ def setLowerBoundsOnCoefficients(self, value): """ return self._set(lowerBoundsOnCoefficients=value) - @since("2.3.0") - def getLowerBoundsOnCoefficients(self): - """ - Gets the value of :py:attr:`lowerBoundsOnCoefficients` - """ - return self.getOrDefault(self.lowerBoundsOnCoefficients) - @since("2.3.0") def setUpperBoundsOnCoefficients(self, value): """ @@ -543,13 +589,6 @@ def setUpperBoundsOnCoefficients(self, value): """ return self._set(upperBoundsOnCoefficients=value) - @since("2.3.0") - def getUpperBoundsOnCoefficients(self): - """ - Gets the value of :py:attr:`upperBoundsOnCoefficients` - """ - return self.getOrDefault(self.upperBoundsOnCoefficients) - @since("2.3.0") def setLowerBoundsOnIntercepts(self, value): """ @@ -557,13 +596,6 @@ def setLowerBoundsOnIntercepts(self, value): """ return self._set(lowerBoundsOnIntercepts=value) - @since("2.3.0") - def getLowerBoundsOnIntercepts(self): - """ - Gets the value of :py:attr:`lowerBoundsOnIntercepts` - """ - return self.getOrDefault(self.lowerBoundsOnIntercepts) - @since("2.3.0") def setUpperBoundsOnIntercepts(self, value): """ @@ -571,16 +603,9 @@ def setUpperBoundsOnIntercepts(self, value): """ return self._set(upperBoundsOnIntercepts=value) - @since("2.3.0") - def getUpperBoundsOnIntercepts(self): - """ - Gets the value of :py:attr:`upperBoundsOnIntercepts` - """ - return self.getOrDefault(self.upperBoundsOnIntercepts) - -class LogisticRegressionModel(JavaProbabilisticClassificationModel, JavaMLWritable, JavaMLReadable, - HasTrainingSummary): +class LogisticRegressionModel(JavaProbabilisticClassificationModel, _LogisticRegressionParams, + JavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by LogisticRegression. 
@@ -1647,8 +1672,36 @@ def evaluateEachIteration(self, dataset): return self._call_java("evaluateEachIteration", dataset) +class _NaiveBayesParams(_JavaPredictorParams, HasWeightCol): + """ + Params for :py:class:`NaiveBayes` and :py:class:`NaiveBayesModel`. + + .. versionadded:: 3.0.0 + """ + + smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " + + "default is 1.0", typeConverter=TypeConverters.toFloat) + modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + + "(case-sensitive). Supported options: multinomial (default) and bernoulli.", + typeConverter=TypeConverters.toString) + + @since("1.5.0") + def getSmoothing(self): + """ + Gets the value of smoothing or its default value. + """ + return self.getOrDefault(self.smoothing) + + @since("1.5.0") + def getModelType(self): + """ + Gets the value of modelType or its default value. + """ + return self.getOrDefault(self.modelType) + + @inherit_doc -class NaiveBayes(JavaProbabilisticClassifier, HasThresholds, HasWeightCol, +class NaiveBayes(JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): """ Naive Bayes Classifiers. @@ -1674,6 +1727,8 @@ class NaiveBayes(JavaProbabilisticClassifier, HasThresholds, HasWeightCol, NaiveBayes_... >>> model.getLabelCol() 'newLabel' + >>> model.getSmoothing() + 1.0 >>> model.pi DenseVector([-0.81..., -0.58...]) >>> model.theta @@ -1712,12 +1767,6 @@ class NaiveBayes(JavaProbabilisticClassifier, HasThresholds, HasWeightCol, .. versionadded:: 1.5.0 """ - smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " + - "default is 1.0", typeConverter=TypeConverters.toFloat) - modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + - "(case-sensitive). 
Supported options: multinomial (default) and bernoulli.", - typeConverter=TypeConverters.toString) - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, @@ -1758,13 +1807,6 @@ def setSmoothing(self, value): """ return self._set(smoothing=value) - @since("1.5.0") - def getSmoothing(self): - """ - Gets the value of smoothing or its default value. - """ - return self.getOrDefault(self.smoothing) - @since("1.5.0") def setModelType(self, value): """ @@ -1772,15 +1814,9 @@ def setModelType(self, value): """ return self._set(modelType=value) - @since("1.5.0") - def getModelType(self): - """ - Gets the value of modelType or its default value. - """ - return self.getOrDefault(self.modelType) - -class NaiveBayesModel(JavaProbabilisticClassificationModel, JavaMLWritable, JavaMLReadable): +class NaiveBayesModel(JavaProbabilisticClassificationModel, _NaiveBayesParams, JavaMLWritable, + JavaMLReadable): """ Model fitted by NaiveBayes. @@ -1804,9 +1840,60 @@ def theta(self): return self._call_java("theta") +class _MultilayerPerceptronParams(_JavaProbabilisticClassifierParams, HasSeed, HasMaxIter, + HasTol, HasStepSize, HasSolver): + """ + Params for :py:class:`MultilayerPerceptronClassifier`. + + .. versionadded:: 3.0.0 + """ + + layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + + "neurons and output layer of 10 neurons.", + typeConverter=TypeConverters.toListInt) + blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is more than " + + "remaining data in a partition then it is adjusted to the size of this " + + "data. 
Recommended size is between 10 and 1000, default is 128.", + typeConverter=TypeConverters.toInt) + solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + + "options: l-bfgs, gd.", typeConverter=TypeConverters.toString) + initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.", + typeConverter=TypeConverters.toVector) + + @since("1.6.0") + def getLayers(self): + """ + Gets the value of layers or its default value. + """ + return self.getOrDefault(self.layers) + + @since("1.6.0") + def getBlockSize(self): + """ + Gets the value of blockSize or its default value. + """ + return self.getOrDefault(self.blockSize) + + @since("2.0.0") + def getStepSize(self): + """ + Gets the value of stepSize or its default value. + """ + return self.getOrDefault(self.stepSize) + + @since("2.0.0") + def getInitialWeights(self): + """ + Gets the value of initialWeights or its default value. + """ + return self.getOrDefault(self.initialWeights) + + @inherit_doc -class MultilayerPerceptronClassifier(JavaProbabilisticClassifier, HasMaxIter, HasTol, HasSeed, - HasStepSize, HasSolver, JavaMLWritable, JavaMLReadable): +class MultilayerPerceptronClassifier(JavaProbabilisticClassifier, _MultilayerPerceptronParams, + JavaMLWritable, JavaMLReadable): """ Classifier trainer based on the Multilayer Perceptron. Each layer has sigmoid activation function, output layer has softmax. @@ -1862,20 +1949,6 @@ class MultilayerPerceptronClassifier(JavaProbabilisticClassifier, HasMaxIter, Ha .. versionadded:: 1.6.0 """ - layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + - "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + - "neurons and output layer of 10 neurons.", - typeConverter=TypeConverters.toListInt) - blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " + - "matrices. Data is stacked within partitions. 
If block size is more than " + - "remaining data in a partition then it is adjusted to the size of this " + - "data. Recommended size is between 10 and 1000, default is 128.", - typeConverter=TypeConverters.toInt) - solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + - "options: l-bfgs, gd.", typeConverter=TypeConverters.toString) - initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.", - typeConverter=TypeConverters.toVector) - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, @@ -1920,13 +1993,6 @@ def setLayers(self, value): """ return self._set(layers=value) - @since("1.6.0") - def getLayers(self): - """ - Gets the value of layers or its default value. - """ - return self.getOrDefault(self.layers) - @since("1.6.0") def setBlockSize(self, value): """ @@ -1934,13 +2000,6 @@ def setBlockSize(self, value): """ return self._set(blockSize=value) - @since("1.6.0") - def getBlockSize(self): - """ - Gets the value of blockSize or its default value. - """ - return self.getOrDefault(self.blockSize) - @since("2.0.0") def setStepSize(self, value): """ @@ -1948,13 +2007,6 @@ def setStepSize(self, value): """ return self._set(stepSize=value) - @since("2.0.0") - def getStepSize(self): - """ - Gets the value of stepSize or its default value. - """ - return self.getOrDefault(self.stepSize) - @since("2.0.0") def setInitialWeights(self, value): """ @@ -1962,13 +2014,6 @@ def setInitialWeights(self, value): """ return self._set(initialWeights=value) - @since("2.0.0") - def getInitialWeights(self): - """ - Gets the value of initialWeights or its default value. 
- """ - return self.getOrDefault(self.initialWeights) - class MultilayerPerceptronClassificationModel(JavaProbabilisticClassificationModel, JavaMLWritable, JavaMLReadable): diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 10450aca2322c..147ebed1d633a 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -24,7 +24,7 @@ _HasVarianceImpurity, _TreeRegressorParams from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ - JavaPredictor, JavaPredictionModel, JavaWrapper + JavaPredictor, JavaPredictionModel, _JavaPredictorParams, JavaWrapper from pyspark.ml.common import inherit_doc from pyspark.sql import DataFrame @@ -40,10 +40,35 @@ 'RandomForestRegressor', 'RandomForestRegressionModel'] +class _LinearRegressionParams(_JavaPredictorParams, HasRegParam, HasElasticNetParam, HasMaxIter, + HasTol, HasFitIntercept, HasStandardization, HasWeightCol, HasSolver, + HasAggregationDepth, HasLoss): + """ + Params for :py:class:`LinearRegression` and :py:class:`LinearRegressionModel`. + + .. versionadded:: 3.0.0 + """ + + solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + + "options: auto, normal, l-bfgs.", typeConverter=TypeConverters.toString) + + loss = Param(Params._dummy(), "loss", "The loss function to be optimized. Supported " + + "options: squaredError, huber.", typeConverter=TypeConverters.toString) + + epsilon = Param(Params._dummy(), "epsilon", "The shape parameter to control the amount of " + + "robustness. Must be > 1.0. Only valid when loss is huber", + typeConverter=TypeConverters.toFloat) + + @since("2.3.0") + def getEpsilon(self): + """ + Gets the value of epsilon or its default value. 
+ """ + return self.getOrDefault(self.epsilon) + + @inherit_doc -class LinearRegression(JavaPredictor, HasMaxIter, HasRegParam, HasTol, HasElasticNetParam, - HasFitIntercept, HasStandardization, HasSolver, HasWeightCol, - HasAggregationDepth, HasLoss, JavaMLWritable, JavaMLReadable): +class LinearRegression(JavaPredictor, _LinearRegressionParams, JavaMLWritable, JavaMLReadable): """ Linear regression. @@ -73,6 +98,8 @@ class LinearRegression(JavaPredictor, HasMaxIter, HasRegParam, HasTol, HasElasti LinearRegression... >>> model.setPredictionCol("newPrediction") LinearRegression... + >>> model.getMaxIter() + 5 >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> abs(model.predict(test0.head().features) - (-1.0)) < 0.001 True @@ -108,16 +135,6 @@ class LinearRegression(JavaPredictor, HasMaxIter, HasRegParam, HasTol, HasElasti .. versionadded:: 1.4.0 """ - solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + - "options: auto, normal, l-bfgs.", typeConverter=TypeConverters.toString) - - loss = Param(Params._dummy(), "loss", "The loss function to be optimized. Supported " + - "options: squaredError, huber.", typeConverter=TypeConverters.toString) - - epsilon = Param(Params._dummy(), "epsilon", "The shape parameter to control the amount of " + - "robustness. Must be > 1.0. Only valid when loss is huber", - typeConverter=TypeConverters.toFloat) - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, @@ -162,16 +179,9 @@ def setEpsilon(self, value): """ return self._set(epsilon=value) - @since("2.3.0") - def getEpsilon(self): - """ - Gets the value of epsilon or its default value. 
- """ - return self.getOrDefault(self.epsilon) - -class LinearRegressionModel(JavaPredictionModel, GeneralJavaMLWritable, JavaMLReadable, - HasTrainingSummary): +class LinearRegressionModel(JavaPredictionModel, _LinearRegressionParams, GeneralJavaMLWritable, + JavaMLReadable, HasTrainingSummary): """ Model fitted by :class:`LinearRegression`. @@ -468,7 +478,7 @@ def totalIterations(self): return self._call_java("totalIterations") -class _IsotonicRegressionBase(HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol): +class _IsotonicRegressionParams(HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol): """ Params for :py:class:`IsotonicRegression` and :py:class:`IsotonicRegressionModel`. @@ -498,7 +508,7 @@ def getFeatureIndex(self): @inherit_doc -class IsotonicRegression(JavaEstimator, _IsotonicRegressionBase, HasWeightCol, +class IsotonicRegression(JavaEstimator, _IsotonicRegressionParams, HasWeightCol, JavaMLWritable, JavaMLReadable): """ Currently implemented using parallelized pool adjacent violators algorithm. @@ -577,8 +587,8 @@ def setFeatureIndex(self, value): return self._set(featureIndex=value) -class IsotonicRegressionModel(JavaModel, _IsotonicRegressionBase, - JavaMLWritable, JavaMLReadable): +class IsotonicRegressionModel(JavaModel, _IsotonicRegressionParams, JavaMLWritable, + JavaMLReadable): """ Model fitted by :class:`IsotonicRegression`. @@ -1460,9 +1470,85 @@ def predict(self, features): return self._call_java("predict", features) +class _GeneralizedLinearRegressionParams(_JavaPredictorParams, HasFitIntercept, HasMaxIter, + HasTol, HasRegParam, HasWeightCol, HasSolver): + """ + Params for :py:class:`GeneralizedLinearRegression` and + :py:class:`GeneralizedLinearRegressionModel`. + + .. versionadded:: 3.0.0 + """ + + family = Param(Params._dummy(), "family", "The name of family which is a description of " + + "the error distribution to be used in the model. 
Supported options: " + + "gaussian (default), binomial, poisson, gamma and tweedie.", + typeConverter=TypeConverters.toString) + link = Param(Params._dummy(), "link", "The name of link function which provides the " + + "relationship between the linear predictor and the mean of the distribution " + + "function. Supported options: identity, log, inverse, logit, probit, cloglog " + + "and sqrt.", typeConverter=TypeConverters.toString) + linkPredictionCol = Param(Params._dummy(), "linkPredictionCol", "link prediction (linear " + + "predictor) column name", typeConverter=TypeConverters.toString) + variancePower = Param(Params._dummy(), "variancePower", "The power in the variance function " + + "of the Tweedie distribution which characterizes the relationship " + + "between the variance and mean of the distribution. Only applicable " + + "for the Tweedie family. Supported values: 0 and [1, Inf).", + typeConverter=TypeConverters.toFloat) + linkPower = Param(Params._dummy(), "linkPower", "The index in the power link function. " + + "Only applicable to the Tweedie family.", + typeConverter=TypeConverters.toFloat) + solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + + "options: irls.", typeConverter=TypeConverters.toString) + offsetCol = Param(Params._dummy(), "offsetCol", "The offset column name. If this is not set " + + "or empty, we treat all instance offsets as 0.0", + typeConverter=TypeConverters.toString) + + @since("2.0.0") + def getFamily(self): + """ + Gets the value of family or its default value. + """ + return self.getOrDefault(self.family) + + @since("2.0.0") + def getLinkPredictionCol(self): + """ + Gets the value of linkPredictionCol or its default value. + """ + return self.getOrDefault(self.linkPredictionCol) + + @since("2.0.0") + def getLink(self): + """ + Gets the value of link or its default value. 
+ """ + return self.getOrDefault(self.link) + + @since("2.2.0") + def getVariancePower(self): + """ + Gets the value of variancePower or its default value. + """ + return self.getOrDefault(self.variancePower) + + @since("2.2.0") + def getLinkPower(self): + """ + Gets the value of linkPower or its default value. + """ + return self.getOrDefault(self.linkPower) + + @since("2.3.0") + def getOffsetCol(self): + """ + Gets the value of offsetCol or its default value. + """ + return self.getOrDefault(self.offsetCol) + + @inherit_doc -class GeneralizedLinearRegression(JavaPredictor, HasFitIntercept, HasMaxIter, HasTol, HasRegParam, - HasWeightCol, HasSolver, JavaMLWritable, JavaMLReadable): +class GeneralizedLinearRegression(JavaPredictor, _GeneralizedLinearRegressionParams, + JavaMLWritable, JavaMLReadable): """ Generalized Linear Regression. @@ -1494,6 +1580,8 @@ class GeneralizedLinearRegression(JavaPredictor, HasFitIntercept, HasMaxIter, Ha >>> model = glr.fit(df) >>> model.setFeaturesCol("features") GeneralizedLinearRegression... + >>> model.getMaxIter() + 25 >>> transformed = model.transform(df) >>> abs(transformed.head().prediction - 1.5) < 0.001 True @@ -1521,30 +1609,6 @@ class GeneralizedLinearRegression(JavaPredictor, HasFitIntercept, HasMaxIter, Ha .. versionadded:: 2.0.0 """ - family = Param(Params._dummy(), "family", "The name of family which is a description of " + - "the error distribution to be used in the model. Supported options: " + - "gaussian (default), binomial, poisson, gamma and tweedie.", - typeConverter=TypeConverters.toString) - link = Param(Params._dummy(), "link", "The name of link function which provides the " + - "relationship between the linear predictor and the mean of the distribution " + - "function. 
Supported options: identity, log, inverse, logit, probit, cloglog " + - "and sqrt.", typeConverter=TypeConverters.toString) - linkPredictionCol = Param(Params._dummy(), "linkPredictionCol", "link prediction (linear " + - "predictor) column name", typeConverter=TypeConverters.toString) - variancePower = Param(Params._dummy(), "variancePower", "The power in the variance function " + - "of the Tweedie distribution which characterizes the relationship " + - "between the variance and mean of the distribution. Only applicable " + - "for the Tweedie family. Supported values: 0 and [1, Inf).", - typeConverter=TypeConverters.toFloat) - linkPower = Param(Params._dummy(), "linkPower", "The index in the power link function. " + - "Only applicable to the Tweedie family.", - typeConverter=TypeConverters.toFloat) - solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + - "options: irls.", typeConverter=TypeConverters.toString) - offsetCol = Param(Params._dummy(), "offsetCol", "The offset column name. If this is not set " + - "or empty, we treat all instance offsets as 0.0", - typeConverter=TypeConverters.toString) - @keyword_only def __init__(self, labelCol="label", featuresCol="features", predictionCol="prediction", family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, @@ -1591,13 +1655,6 @@ def setFamily(self, value): """ return self._set(family=value) - @since("2.0.0") - def getFamily(self): - """ - Gets the value of family or its default value. - """ - return self.getOrDefault(self.family) - @since("2.0.0") def setLinkPredictionCol(self, value): """ @@ -1605,13 +1662,6 @@ def setLinkPredictionCol(self, value): """ return self._set(linkPredictionCol=value) - @since("2.0.0") - def getLinkPredictionCol(self): - """ - Gets the value of linkPredictionCol or its default value. 
- """ - return self.getOrDefault(self.linkPredictionCol) - @since("2.0.0") def setLink(self, value): """ @@ -1619,13 +1669,6 @@ def setLink(self, value): """ return self._set(link=value) - @since("2.0.0") - def getLink(self): - """ - Gets the value of link or its default value. - """ - return self.getOrDefault(self.link) - @since("2.2.0") def setVariancePower(self, value): """ @@ -1633,13 +1676,6 @@ def setVariancePower(self, value): """ return self._set(variancePower=value) - @since("2.2.0") - def getVariancePower(self): - """ - Gets the value of variancePower or its default value. - """ - return self.getOrDefault(self.variancePower) - @since("2.2.0") def setLinkPower(self, value): """ @@ -1647,13 +1683,6 @@ def setLinkPower(self, value): """ return self._set(linkPower=value) - @since("2.2.0") - def getLinkPower(self): - """ - Gets the value of linkPower or its default value. - """ - return self.getOrDefault(self.linkPower) - @since("2.3.0") def setOffsetCol(self, value): """ @@ -1661,16 +1690,9 @@ def setOffsetCol(self, value): """ return self._set(offsetCol=value) - @since("2.3.0") - def getOffsetCol(self): - """ - Gets the value of offsetCol or its default value. - """ - return self.getOrDefault(self.offsetCol) - -class GeneralizedLinearRegressionModel(JavaPredictionModel, JavaMLWritable, - JavaMLReadable, HasTrainingSummary): +class GeneralizedLinearRegressionModel(JavaPredictionModel, _GeneralizedLinearRegressionParams, + JavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by :class:`GeneralizedLinearRegression`.