From fed983193c89743ef652e0719cc6416717a67f66 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 11 Sep 2019 18:53:21 -0700 Subject: [PATCH 1/5] [SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON --- python/pyspark/ml/classification.py | 161 ++++++++++++++++++++------ python/pyspark/ml/regression.py | 53 +++++---- python/pyspark/ml/tests/test_param.py | 3 +- python/pyspark/ml/util.py | 16 --- python/pyspark/ml/wrapper.py | 74 ++++++++++++ 5 files changed, 235 insertions(+), 72 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b6c28d05ea273..babbdc23e1cfb 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -26,8 +26,8 @@ DecisionTreeRegressionModel, GBTParams, HasVarianceImpurity, RandomForestParams, \ TreeEnsembleModel from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams -from pyspark.ml.wrapper import JavaWrapper +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ + JavaPredictor, JavaPredictorParams, JavaPredictionModel, JavaWrapper from pyspark.ml.common import inherit_doc, _java2py, _py2java from pyspark.ml.linalg import Vectors from pyspark.sql import DataFrame @@ -47,14 +47,41 @@ 'OneVsRest', 'OneVsRestModel'] +class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams): + """ + (Private) Java Classifier Params for classification tasks. + """ + pass + + +@inherit_doc +class JavaClassifier(JavaPredictor, JavaClassifierParams): + """ + Java Classifier for classification tasks. + Classes are indexed {0, 1, ..., numClasses - 1}. + """ + + def setRawPredictionCol(self, value): + """ + Sets the value of :py:attr:`rawPredictionCol`. + """ + return self._set(rawPredictionCol=value) + + @inherit_doc -class JavaClassificationModel(JavaPredictionModel): +class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams): """ - (Private) Java Model produced by a ``Classifier``. + Java Model produced by a ``Classifier``. Classes are indexed {0, 1, ..., numClasses - 1}. To be mixed in with class:`pyspark.ml.JavaModel` """ + def setRawPredictionCol(self, value): + """ + Sets the value of :py:attr:`rawPredictionCol`. + """ + return self._set(rawPredictionCol=value) + @property @since("2.1.0") def numClasses(self): @@ -64,10 +91,56 @@ def numClasses(self): return self._call_java("numClasses") +class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams): + """ + (Private) Java Probabilistic Classifier Params for classification tasks. + """ + pass + + @inherit_doc -class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasRawPredictionCol, HasFitIntercept, HasStandardization, - HasWeightCol, HasAggregationDepth, HasThreshold, JavaMLWritable, JavaMLReadable): +class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams): + """ + Java Probabilistic Classifier for classification tasks. + """ + + def setProbabilityCol(self, value): + """ + Sets the value of :py:attr:`probabilityCol`. + """ + return self._set(probabilityCol=value) + + def setThresholds(self, value): + """ + Sets the value of :py:attr:`thresholds`. + """ + return self._set(thresholds=value) + + +@inherit_doc +class JavaProbabilisticClassificationModel(JavaClassificationModel, + JavaProbabilisticClassifierParams): + """ + Java Model produced by a ``ProbabilisticClassifier``. + """ + + def setProbabilityCol(self, value): + """ + Sets the value of :py:attr:`probabilityCol`. + """ + return self._set(probabilityCol=value) + + def setThresholds(self, value): + """ + Sets the value of :py:attr:`thresholds`. + """ + return self._set(thresholds=value) + + +@inherit_doc +class LinearSVC(JavaClassifier, HasMaxIter, HasRegParam, HasTol, + HasFitIntercept, HasStandardization, HasWeightCol, HasAggregationDepth, + HasThreshold, JavaMLWritable, JavaMLReadable): """ `Linear SVM Classifier `_ @@ -81,6 +154,8 @@ class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, Ha ... Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() >>> svm = LinearSVC(maxIter=5, regParam=0.01) >>> model = svm.fit(df) + >>> model.setPredictionCol("prediction") + LinearSVC... >>> model.coefficients DenseVector([0.0, -0.2792, -0.1833]) >>> model.intercept @@ -90,6 +165,8 @@ class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, Ha >>> model.numFeatures 3 >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, -1.0, -1.0))]).toDF() + >>> model.predict(test0.head().features) + 1.0 >>> result = model.transform(test0).head() >>> result.prediction 1.0 @@ -156,7 +233,7 @@ def _create_model(self, java_model): return LinearSVCModel(java_model) -class LinearSVCModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): +class LinearSVCModel(JavaClassificationModel, JavaMLWritable, JavaMLReadable): """ Model fitted by LinearSVC. @@ -181,8 +258,7 @@ def intercept(self): @inherit_doc -class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol, +class LogisticRegression(JavaProbabilisticClassifier, HasMaxIter, HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, HasWeightCol, HasAggregationDepth, JavaMLWritable, JavaMLReadable): """ @@ -198,6 +274,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ... Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))]).toDF() >>> blor = LogisticRegression(regParam=0.01, weightCol="weight") >>> blorModel = blor.fit(bdf) + >>> blorModel.setFeaturesCol("features") + LogisticRegressionModel... >>> blorModel.coefficients DenseVector([-1.080..., -0.646...]) >>> blorModel.intercept @@ -211,6 +289,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti >>> mlorModel.interceptVector DenseVector([0.04..., -0.42..., 0.37...]) >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 1.0))]).toDF() + >>> blorModel.predict(test0.head().features) + 1.0 >>> result = blorModel.transform(test0).head() >>> result.prediction 1.0 @@ -481,7 +561,7 @@ def getUpperBoundsOnIntercepts(self): return self.getOrDefault(self.upperBoundsOnIntercepts) -class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable, +class LogisticRegressionModel(JavaProbabilisticClassificationModel, JavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by LogisticRegression. @@ -872,8 +952,7 @@ def getImpurity(self): @inherit_doc -class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasWeightCol, - HasPredictionCol, HasProbabilityCol, HasRawPredictionCol, +class DecisionTreeClassifier(JavaProbabilisticClassifier, HasWeightCol, DecisionTreeParams, TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable): """ @@ -892,6 +971,10 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasWeig >>> td = si_model.transform(df) >>> dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed", leafCol="leafId") >>> model = dt.fit(td) + >>> model.getLabelCol() + 'indexed' + >>> model.setFeaturesCol("features") + DecisionTreeClassificationModel... >>> model.numNodes 3 >>> model.depth @@ -905,6 +988,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasWeig >>> print(model.toDebugString) DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes... >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) + >>> model.predict(test0.head().features) + 0.0 >>> result = model.transform(test0).head() >>> result.prediction 0.0 @@ -1031,8 +1116,8 @@ def setImpurity(self, value): @inherit_doc -class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel, JavaMLWritable, - JavaMLReadable): +class DecisionTreeClassificationModel(DecisionTreeModel, JavaProbabilisticClassificationModel, + JavaMLWritable, JavaMLReadable): """ Model fitted by DecisionTreeClassifier. @@ -1062,9 +1147,8 @@ def featureImportances(self): @inherit_doc -class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, - HasRawPredictionCol, HasProbabilityCol, - RandomForestParams, TreeClassifierParams, HasCheckpointInterval, +class RandomForestClassifier(JavaProbabilisticClassifier, HasSeed, RandomForestParams, + TreeClassifierParams, HasCheckpointInterval, JavaMLWritable, JavaMLReadable): """ `Random Forest `_ @@ -1231,8 +1315,8 @@ def setFeatureSubsetStrategy(self, value): return self._set(featureSubsetStrategy=value) -class RandomForestClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable, - JavaMLReadable): +class RandomForestClassificationModel(TreeEnsembleModel, JavaProbabilisticClassificationModel, + JavaMLWritable, JavaMLReadable): """ Model fitted by RandomForestClassifier. @@ -1284,9 +1368,8 @@ def getLossType(self): @inherit_doc -class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - GBTClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable, - JavaMLReadable): +class GBTClassifier(JavaProbabilisticClassifier, GBTClassifierParams, HasCheckpointInterval, + HasSeed, JavaMLWritable, JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for classification. @@ -1318,11 +1401,17 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> gbt.getFeatureSubsetStrategy() 'all' >>> model = gbt.fit(td) + >>> model.getLabelCol() + 'indexed' + >>> model.setFeaturesCol("features") + GBTClassificationModel... >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) + >>> model.predict(test0.head().features) + 0.0 >>> result = model.transform(test0).head() >>> result.prediction 0.0 @@ -1485,8 +1574,8 @@ def setValidationIndicatorCol(self, value): return self._set(validationIndicatorCol=value) -class GBTClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable, - JavaMLReadable): +class GBTClassificationModel(TreeEnsembleModel, JavaProbabilisticClassificationModel, + JavaMLWritable, JavaMLReadable): """ Model fitted by GBTClassifier. @@ -1527,8 +1616,8 @@ def evaluateEachIteration(self, dataset): @inherit_doc -class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, - HasRawPredictionCol, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): +class NaiveBayes(JavaProbabilisticClassifier, HasThresholds, HasWeightCol, + JavaMLWritable, JavaMLReadable): """ Naive Bayes Classifiers. It supports both Multinomial and Bernoulli NB. `Multinomial NB @@ -1547,11 +1636,15 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H ... Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))]) >>> nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight") >>> model = nb.fit(df) + >>> model.setFeaturesCol("features") + NaiveBayes_... >>> model.pi DenseVector([-0.81..., -0.58...]) >>> model.theta DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1) >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() + >>> model.predict(test0.head().features) + 1.0 >>> result = model.transform(test0).head() >>> result.prediction 1.0 @@ -1651,7 +1744,7 @@ def getModelType(self): return self.getOrDefault(self.modelType) -class NaiveBayesModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): +class NaiveBayesModel(JavaProbabilisticClassificationModel, JavaMLWritable, JavaMLReadable): """ Model fitted by NaiveBayes. @@ -1676,10 +1769,8 @@ def theta(self): @inherit_doc -class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasMaxIter, HasTol, HasSeed, HasStepSize, HasSolver, - JavaMLWritable, JavaMLReadable, HasProbabilityCol, - HasRawPredictionCol): +class MultilayerPerceptronClassifier(JavaProbabilisticClassifier, HasMaxIter, HasTol, HasSeed, + HasStepSize, HasSolver, JavaMLWritable, JavaMLReadable): """ Classifier trainer based on the Multilayer Perceptron. Each layer has sigmoid activation function, output layer has softmax. @@ -1694,6 +1785,8 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 2, 2], blockSize=1, seed=123) >>> model = mlp.fit(df) + >>> model.setFeaturesCol("features") + MultilayerPerceptronClassifier... >>> model.layers [2, 2, 2] >>> model.weights.size @@ -1701,6 +1794,8 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, >>> testDF = spark.createDataFrame([ ... (Vectors.dense([1.0, 0.0]),), ... (Vectors.dense([0.0, 0.0]),)], ["features"]) + >>> model.predict(testDF.head().features) + 1.0 >>> model.transform(testDF).select("features", "prediction").show() +---------+----------+ | features|prediction| @@ -1839,7 +1934,7 @@ def getInitialWeights(self): return self.getOrDefault(self.initialWeights) -class MultilayerPerceptronClassificationModel(JavaModel, JavaClassificationModel, JavaMLWritable, +class MultilayerPerceptronClassificationModel(JavaProbabilisticClassificationModel, JavaMLWritable, JavaMLReadable): """ Model fitted by MultilayerPerceptronClassifier. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 2d1d1272c17f8..207b1720f8fa4 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -20,7 +20,8 @@ from pyspark import since, keyword_only from pyspark.ml.param.shared import * from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ + JavaPredictor, JavaPredictionModel, JavaWrapper from pyspark.ml.common import inherit_doc from pyspark.sql import DataFrame @@ -37,10 +38,9 @@ @inherit_doc -class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept, - HasStandardization, HasSolver, HasWeightCol, HasAggregationDepth, HasLoss, - JavaMLWritable, JavaMLReadable): +class LinearRegression(JavaPredictor, HasMaxIter, HasRegParam, HasTol, HasElasticNetParam, + HasFitIntercept, HasStandardization, HasSolver, HasWeightCol, + HasAggregationDepth, HasLoss, JavaMLWritable, JavaMLReadable): """ Linear regression. @@ -66,7 +66,11 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction ... (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"]) >>> lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight") >>> model = lr.fit(df) + >>> model.setFeaturesCol("features") + LinearRegression... >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) + >>> abs(model.predict(test0.head().features) - (-1.0)) < 0.001 + True >>> abs(model.transform(test0).head().prediction - (-1.0)) < 0.001 True >>> abs(model.coefficients[0] - 1.0) < 0.001 @@ -161,7 +165,7 @@ def getEpsilon(self): return self.getOrDefault(self.epsilon) -class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritable, JavaMLReadable, +class LinearRegressionModel(JavaPredictionModel, GeneralJavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by :class:`LinearRegression`. @@ -804,8 +808,7 @@ def getLossType(self): @inherit_doc -class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasWeightCol, - HasPredictionCol, DecisionTreeParams, TreeRegressorParams, +class DecisionTreeRegressor(JavaPredictor, HasWeightCol, DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol): """ @@ -828,6 +831,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasWeigh >>> model.numFeatures 1 >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) + >>> model.predict(test0.head().features) + 0.0 >>> result = model.transform(test0).head() >>> result.prediction 0.0 @@ -950,7 +955,7 @@ def setImpurity(self, value): @inherit_doc -class DecisionTreeModel(JavaModel, JavaPredictionModel): +class DecisionTreeModel(JavaPredictionModel): """ Abstraction for Decision Tree models. @@ -1052,9 +1057,8 @@ def featureImportances(self): @inherit_doc -class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, - RandomForestParams, TreeRegressorParams, HasCheckpointInterval, - JavaMLWritable, JavaMLReadable): +class RandomForestRegressor(JavaPredictor, HasSeed, RandomForestParams, TreeRegressorParams, + HasCheckpointInterval, JavaMLWritable, JavaMLReadable): """ `Random Forest `_ learning algorithm for regression. @@ -1072,6 +1076,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> allclose(model.treeWeights, [1.0, 1.0]) True >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) + >>> model.predict(test0.head().features) + 0.0 >>> result = model.transform(test0).head() >>> result.prediction 0.0 @@ -1240,9 +1246,8 @@ def featureImportances(self): @inherit_doc -class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - GBTRegressorParams, HasCheckpointInterval, HasSeed, JavaMLWritable, - JavaMLReadable): +class GBTRegressor(JavaPredictor, GBTRegressorParams, HasCheckpointInterval, HasSeed, + JavaMLWritable, JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for regression. @@ -1259,6 +1264,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> print(gbt.getFeatureSubsetStrategy()) all >>> model = gbt.fit(df) + >>> model.setFeaturesCol("features") + GBTRegressionModel... >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> model.numFeatures @@ -1266,6 +1273,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) + >>> model.predict(test0.head().features) + 0.0 >>> result = model.transform(test0).head() >>> result.prediction 0.0 @@ -1465,9 +1474,8 @@ def evaluateEachIteration(self, dataset, loss): @inherit_doc -class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasFitIntercept, HasMaxIter, HasTol, HasAggregationDepth, - JavaMLWritable, JavaMLReadable): +class AFTSurvivalRegression(JavaPredictor, HasFitIntercept, HasMaxIter, HasTol, + HasAggregationDepth, JavaMLWritable, JavaMLReadable): """ Accelerated Failure Time (AFT) Model Survival Regression @@ -1654,9 +1662,8 @@ def predict(self, features): @inherit_doc -class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, HasPredictionCol, - HasFitIntercept, HasMaxIter, HasTol, HasRegParam, HasWeightCol, - HasSolver, JavaMLWritable, JavaMLReadable): +class GeneralizedLinearRegression(JavaPredictor, HasFitIntercept, HasMaxIter, HasTol, HasRegParam, + HasWeightCol, HasSolver, JavaMLWritable, JavaMLReadable): """ Generalized Linear Regression. @@ -1686,6 +1693,8 @@ class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, Ha ... (2.0, Vectors.dense(1.0, 1.0)),], ["label", "features"]) >>> glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p") >>> model = glr.fit(df) + >>> model.setFeaturesCol("features") + GeneralizedLinearRegression... >>> transformed = model.transform(df) >>> abs(transformed.head().prediction - 1.5) < 0.001 True @@ -1861,7 +1870,7 @@ def getOffsetCol(self): return self.getOrDefault(self.offsetCol) -class GeneralizedLinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, +class GeneralizedLinearRegressionModel(JavaPredictionModel, JavaMLWritable, JavaMLReadable, HasTrainingSummary): """ Model fitted by :class:`GeneralizedLinearRegression`. diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index cbeac0b1319c8..68eea0a7cc0aa 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -343,7 +343,8 @@ def test_java_params(self): for module in modules: for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and not name.endswith('Params') \ - and issubclass(cls, JavaParams) and not inspect.isabstract(cls): + and issubclass(cls, JavaParams) and not inspect.isabstract(cls) \ + and not name.startswith('Java'): # NOTE: disable check_params_exist until there is parity with Scala API check_params(self, cls(), check_params_exist=False) diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index debaf38e0896e..81b785e71f302 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -342,22 +342,6 @@ def read(cls): return JavaMLReader(cls) -@inherit_doc -class JavaPredictionModel(): - """ - (Private) Java Model for prediction tasks (regression and classification). - To be mixed in with class:`pyspark.ml.JavaModel` - """ - - @property - @since("2.1.0") - def numFeatures(self): - """ - Returns the number of features the model was trained on. If unknown, returns -1 - """ - return self._call_java("numFeatures") - - @inherit_doc class DefaultParamsWritable(MLWritable): """ diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 9bb1262a54500..3137db7181c07 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -20,10 +20,12 @@ if sys.version >= '3': xrange = range +from pyspark import since from pyspark import SparkContext from pyspark.sql import DataFrame from pyspark.ml import Estimator, Transformer, Model from pyspark.ml.param import Params +from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol from pyspark.ml.util import _jvm from pyspark.ml.common import inherit_doc, _java2py, _py2java @@ -361,3 +363,75 @@ def __init__(self, java_model=None): self._create_params_from_java() self._resetUid(java_model.uid()) + + +@inherit_doc +class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol): + """ + (Private) Trait for parameters for prediction (regression and classification) + """ + pass + + +@inherit_doc +class JavaPredictor(JavaEstimator, JavaPredictorParams): + """ + (Private) Java Estimator for prediction tasks (regression and classification). + """ + + @since("3.0.0") + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + +@inherit_doc +class JavaPredictionModel(JavaModel, JavaPredictorParams): + """ + (Private) Java Model for prediction tasks (regression and classification). + """ + + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @property + @since("2.1.0") + def numFeatures(self): + """ + Returns the number of features the model was trained on. If unknown, returns -1 + """ + return self._call_java("numFeatures") + + @since("3.0.0") + def predict(self, value): + """ + Predict label for the given features. + """ + return self._call_java("predict", value) From d6c7da2db4bc5b3286ad5b3cb5b5840223a4ff6f Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 12 Sep 2019 10:24:41 -0700 Subject: [PATCH 2/5] add a few changes --- python/pyspark/ml/regression.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 207b1720f8fa4..3240a596a56b5 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -464,8 +464,7 @@ def totalIterations(self): @inherit_doc -class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasWeightCol, JavaMLWritable, JavaMLReadable): +class IsotonicRegression(JavaPredictor, HasWeightCol, JavaMLWritable, JavaMLReadable): """ Currently implemented using parallelized pool adjacent violators algorithm. Only univariate (single feature) algorithm supported. @@ -476,6 +475,8 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> ir = IsotonicRegression() >>> model = ir.fit(df) + >>> model.setFeaturesCol("features") + IsotonicRegression... >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -559,7 +560,7 @@ def getFeatureIndex(self): return self.getOrDefault(self.featureIndex) -class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): +class IsotonicRegressionModel(JavaPredictionModel, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`IsotonicRegression`. @@ -1490,6 +1491,8 @@ class AFTSurvivalRegression(JavaPredictor, HasFitIntercept, HasMaxIter, HasTol, ... (1e-40, Vectors.sparse(1, [], []), 0.0)], ["label", "features", "censor"]) >>> aftsr = AFTSurvivalRegression() >>> model = aftsr.fit(df) + >>> model.setFeaturesCol("features") + AFTSurvivalRegression... >>> model.predict(Vectors.dense(6.3)) 1.0 >>> model.predictQuantiles(Vectors.dense(6.3)) @@ -1615,7 +1618,7 @@ def getQuantilesCol(self): return self.getOrDefault(self.quantilesCol) -class AFTSurvivalRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): +class AFTSurvivalRegressionModel(JavaPredictionModel, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`AFTSurvivalRegression`. From de6f8e8cd44a2ea9fac01a231c08f17c516e3bdf Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 16 Sep 2019 16:44:41 -0700 Subject: [PATCH 3/5] address comments --- python/pyspark/ml/classification.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index babbdc23e1cfb..7f2ca8dd17ae2 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -61,6 +61,7 @@ class JavaClassifier(JavaPredictor, JavaClassifierParams): Classes are indexed {0, 1, ..., numClasses - 1}. """ + @since("3.0.0") def setRawPredictionCol(self, value): """ Sets the value of :py:attr:`rawPredictionCol`. @@ -76,6 +77,7 @@ class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams): To be mixed in with class:`pyspark.ml.JavaModel` """ + @since("3.0.0") def setRawPredictionCol(self, value): """ Sets the value of :py:attr:`rawPredictionCol`. @@ -104,12 +106,14 @@ class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierPar Java Probabilistic Classifier for classification tasks. """ + @since("3.0.0") def setProbabilityCol(self, value): """ Sets the value of :py:attr:`probabilityCol`. """ return self._set(probabilityCol=value) + @since("3.0.0") def setThresholds(self, value): """ Sets the value of :py:attr:`thresholds`. @@ -124,12 +128,14 @@ class JavaProbabilisticClassificationModel(JavaClassificationModel, Java Model produced by a ``ProbabilisticClassifier``. """ + @since("3.0.0") def setProbabilityCol(self, value): """ Sets the value of :py:attr:`probabilityCol`. """ return self._set(probabilityCol=value) + @since("3.0.0") def setThresholds(self, value): """ Sets the value of :py:attr:`thresholds`. @@ -1959,8 +1965,7 @@ def weights(self): return self._call_java("weights") -class OneVsRestParams(HasFeaturesCol, HasLabelCol, HasWeightCol, HasPredictionCol, - HasRawPredictionCol): +class OneVsRestParams(JavaClassifierParams, HasWeightCol): """ Parameters for OneVsRest and OneVsRestModel. """ From 95f88f59da68827d103ffd68b2093e90d75c93c1 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 17 Sep 2019 13:57:13 -0700 Subject: [PATCH 4/5] add simple doc test for RandomForestClassificationModel --- python/pyspark/ml/classification.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 7f2ca8dd17ae2..aa21fddaa1093 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1175,11 +1175,17 @@ class RandomForestClassifier(JavaProbabilisticClassifier, HasSeed, RandomForestP >>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42, ... leafCol="leafId") >>> model = rf.fit(td) + >>> model.getLabelCol() + 'indexed' + >>> model.setFeaturesCol("features") + RandomForestClassificationModel... >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) True >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) + >>> model.predict(test0.head().features) + 0.0 >>> result = model.transform(test0).head() >>> result.prediction 0.0 From bc1d9e1ef4a27c9c619c10378d2a48ae0cad7836 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 18 Sep 2019 09:25:50 -0700 Subject: [PATCH 5/5] address comments and add a few more doctests --- python/pyspark/ml/classification.py | 38 ++++++++++++++++++++++------- python/pyspark/ml/regression.py | 10 ++++++-- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index aa21fddaa1093..e2e313c7f9252 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -160,8 +160,10 @@ class LinearSVC(JavaClassifier, HasMaxIter, HasRegParam, HasTol, ... Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() >>> svm = LinearSVC(maxIter=5, regParam=0.01) >>> model = svm.fit(df) - >>> model.setPredictionCol("prediction") + >>> model.setPredictionCol("newPrediction") LinearSVC... + >>> model.getPredictionCol() + 'newPrediction' >>> model.coefficients DenseVector([0.0, -0.2792, -0.1833]) >>> model.intercept @@ -174,7 +176,7 @@ class LinearSVC(JavaClassifier, HasMaxIter, HasRegParam, HasTol, >>> model.predict(test0.head().features) 1.0 >>> result = model.transform(test0).head() - >>> result.prediction + >>> result.newPrediction 1.0 >>> result.rawPrediction DenseVector([-1.4831, 1.4831]) @@ -282,6 +284,10 @@ class LogisticRegression(JavaProbabilisticClassifier, HasMaxIter, HasRegParam, H >>> blorModel = blor.fit(bdf) >>> blorModel.setFeaturesCol("features") LogisticRegressionModel... + >>> blorModel.setProbabilityCol("newProbability") + LogisticRegressionModel... + >>> blorModel.getProbabilityCol() + 'newProbability' >>> blorModel.coefficients DenseVector([-1.080..., -0.646...]) >>> blorModel.intercept @@ -300,7 +306,7 @@ class LogisticRegression(JavaProbabilisticClassifier, HasMaxIter, HasRegParam, H >>> result = blorModel.transform(test0).head() >>> result.prediction 1.0 - >>> result.probability + >>> result.newProbability DenseVector([0.02..., 0.97...]) >>> result.rawPrediction DenseVector([-3.54..., 3.54...]) @@ -1179,6 +1185,10 @@ class RandomForestClassifier(JavaProbabilisticClassifier, HasSeed, RandomForestP 'indexed' >>> model.setFeaturesCol("features") RandomForestClassificationModel... + >>> model.setRawPredictionCol("newRawPrediction") + RandomForestClassificationModel... + >>> model.getRawPredictionCol() + 'newRawPrediction' >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) @@ -1191,7 +1201,7 @@ class RandomForestClassifier(JavaProbabilisticClassifier, HasSeed, RandomForestP 0.0 >>> numpy.argmax(result.probability) 0 - >>> numpy.argmax(result.rawPrediction) + >>> numpy.argmax(result.newRawPrediction) 0 >>> result.leafId DenseVector([0.0, 0.0, 0.0]) @@ -1417,6 +1427,10 @@ class GBTClassifier(JavaProbabilisticClassifier, GBTClassifierParams, HasCheckpo 'indexed' >>> model.setFeaturesCol("features") GBTClassificationModel... + >>> model.setThresholds([0.3, 0.7]) + GBTClassificationModel... + >>> model.getThresholds() + [0.3, 0.7] >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) @@ -1650,6 +1664,10 @@ class NaiveBayes(JavaProbabilisticClassifier, HasThresholds, HasWeightCol, >>> model = nb.fit(df) >>> model.setFeaturesCol("features") NaiveBayes_... + >>> model.setLabelCol("newLabel") + NaiveBayes_... + >>> model.getLabelCol() + 'newLabel' >>> model.pi DenseVector([-0.81..., -0.58...]) >>> model.theta @@ -2003,6 +2021,8 @@ class OneVsRest(Estimator, OneVsRestParams, HasParallelism, JavaMLReadable, Java >>> ovr = OneVsRest(classifier=lr) >>> ovr.getRawPredictionCol() 'rawPrediction' + >>> ovr.setPredictionCol("newPrediction") + OneVsRest... >>> model = ovr.fit(df) >>> model.models[0].coefficients DenseVector([0.5..., -1.0..., 3.4..., 4.2...]) @@ -2013,21 +2033,21 @@ class OneVsRest(Estimator, OneVsRestParams, HasParallelism, JavaMLReadable, Java >>> [x.intercept for x in model.models] [-2.7..., -2.5..., -1.3...] >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0, 1.0, 1.0))]).toDF() - >>> model.transform(test0).head().prediction + >>> model.transform(test0).head().newPrediction 0.0 >>> test1 = sc.parallelize([Row(features=Vectors.sparse(4, [0], [1.0]))]).toDF() - >>> model.transform(test1).head().prediction + >>> model.transform(test1).head().newPrediction 2.0 >>> test2 = sc.parallelize([Row(features=Vectors.dense(0.5, 0.4, 0.3, 0.2))]).toDF() - >>> model.transform(test2).head().prediction + >>> model.transform(test2).head().newPrediction 0.0 >>> model_path = temp_path + "/ovr_model" >>> model.save(model_path) >>> model2 = OneVsRestModel.load(model_path) - >>> model2.transform(test0).head().prediction + >>> model2.transform(test0).head().newPrediction 0.0 >>> model.transform(test2).columns - ['features', 'rawPrediction', 'prediction'] + ['features', 'rawPrediction', 'newPrediction'] .. versionadded:: 2.0.0 """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 3240a596a56b5..f2bcc662030c6 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -68,17 +68,19 @@ class LinearRegression(JavaPredictor, HasMaxIter, HasRegParam, HasTol, HasElasti >>> model = lr.fit(df) >>> model.setFeaturesCol("features") LinearRegression... + >>> model.setPredictionCol("newPrediction") + LinearRegression... >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> abs(model.predict(test0.head().features) - (-1.0)) < 0.001 True - >>> abs(model.transform(test0).head().prediction - (-1.0)) < 0.001 + >>> abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001 True >>> abs(model.coefficients[0] - 1.0) < 0.001 True >>> abs(model.intercept - 0.0) < 0.001 True >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> abs(model.transform(test1).head().prediction - 1.0) < 0.001 + >>> abs(model.transform(test1).head().newPrediction - 1.0) < 0.001 True >>> lr.setParams("vector") Traceback (most recent call last): @@ -477,6 +479,10 @@ class IsotonicRegression(JavaPredictor, HasWeightCol, JavaMLWritable, JavaMLRead >>> model = ir.fit(df) >>> model.setFeaturesCol("features") IsotonicRegression... + >>> model.setLabelCol("newLabel") + IsotonicRegression... + >>> model.getLabelCol() + 'newLabel' >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0