From 4cc6ffb4c0cadb46e60c72f3546064cabceb78ec Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 25 Sep 2019 11:25:51 -0700 Subject: [PATCH 1/6] [SPARK-29116][PYTHON][ML] Refactor py classes related to DecisionTree --- python/pyspark/ml/classification.py | 98 +++---- python/pyspark/ml/regression.py | 403 ++++++---------------------- python/pyspark/ml/tree.py | 348 ++++++++++++++++++++++++ 3 files changed, 483 insertions(+), 366 deletions(-) create mode 100644 python/pyspark/ml/tree.py diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e2e313c7f9252..433cb631bacc7 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -22,9 +22,10 @@ from pyspark import since, keyword_only from pyspark.ml import Estimator, Model from pyspark.ml.param.shared import * -from pyspark.ml.regression import DecisionTreeModel, DecisionTreeParams, \ - DecisionTreeRegressionModel, GBTParams, HasVarianceImpurity, RandomForestParams, \ - TreeEnsembleModel +from pyspark.ml.tree import DecisionTreeModel, DecisionTreeParams, \ + TreeEnsembleModel, RandomForestParams, GBTParams, \ + HasVarianceImpurity, TreeClassifierParams, TreeEnsembleParams +from pyspark.ml.regression import DecisionTreeRegressionModel from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ JavaPredictor, JavaPredictorParams, JavaPredictionModel, JavaWrapper @@ -939,34 +940,18 @@ class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, pass -class TreeClassifierParams(object): +@inherit_doc +class DecisionTreeClassifierParams(DecisionTreeParams, TreeClassifierParams, + JavaProbabilisticClassifierParams): """ - Private class to track supported impurity measures. - - .. versionadded:: 1.4.0 + (Private) Params for DecisionTree Classifier. 
""" - supportedImpurities = ["entropy", "gini"] - - impurity = Param(Params._dummy(), "impurity", - "Criterion used for information gain calculation (case-insensitive). " + - "Supported options: " + - ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) - - def __init__(self): - super(TreeClassifierParams, self).__init__() - - @since("1.6.0") - def getImpurity(self): - """ - Gets the value of impurity or its default value. - """ - return self.getOrDefault(self.impurity) + pass @inherit_doc -class DecisionTreeClassifier(JavaProbabilisticClassifier, HasWeightCol, - DecisionTreeParams, TreeClassifierParams, HasCheckpointInterval, - HasSeed, JavaMLWritable, JavaMLReadable): +class DecisionTreeClassifier(JavaProbabilisticClassifier, DecisionTreeClassifierParams, + JavaMLWritable, JavaMLReadable): """ `Decision tree `_ learning algorithm for classification. @@ -1045,20 +1030,20 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", - seed=None, weightCol=None, leafCol=""): + seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - seed=None, weightCol=None, leafCol="") + seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0) """ super(DecisionTreeClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, 
cacheNodeIds=False, checkpointInterval=10, - impurity="gini", leafCol="") + impurity="gini", leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1068,13 +1053,14 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini", seed=None, weightCol=None, leafCol=""): + impurity="gini", seed=None, weightCol=None, leafCol="", + minWeightFractionPerNode=0.0): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - seed=None, weightCol=None, leafCol="") + seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0) Sets params for the DecisionTreeClassifier. """ kwargs = self._input_kwargs @@ -1101,6 +1087,13 @@ def setMinInstancesPerNode(self, value): """ return self._set(minInstancesPerNode=value) + @since("3.0.0") + def setMinWeightFractionPerNode(self, value): + """ + Sets the value of :py:attr:`minWeightFractionPerNode`. + """ + return self._set(minWeightFractionPerNode=value) + def setMinInfoGain(self, value): """ Sets the value of :py:attr:`minInfoGain`. @@ -1129,7 +1122,8 @@ def setImpurity(self, value): @inherit_doc class DecisionTreeClassificationModel(DecisionTreeModel, JavaProbabilisticClassificationModel, - JavaMLWritable, JavaMLReadable): + DecisionTreeClassifierParams, JavaMLWritable, + JavaMLReadable): """ Model fitted by DecisionTreeClassifier. 
@@ -1159,8 +1153,15 @@ def featureImportances(self): @inherit_doc -class RandomForestClassifier(JavaProbabilisticClassifier, HasSeed, RandomForestParams, - TreeClassifierParams, HasCheckpointInterval, +class RandomForestClassifierParams(RandomForestParams, TreeClassifierParams): + """ + (Private) Params for RandomForest Classifier. + """ + pass + + +@inherit_doc +class RandomForestClassifier(JavaProbabilisticClassifier, RandomForestClassifierParams, JavaMLWritable, JavaMLReadable): """ `Random Forest `_ @@ -1230,14 +1231,14 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, - leafCol=""): + leafCol="", minWeightFractionPerNode=0.0): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, \ - leafCol="") + leafCol="", minWeightFractionPerNode=0.0) """ super(RandomForestClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -1245,7 +1246,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", - subsamplingRate=1.0, leafCol="") + subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1256,14 +1257,14 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, 
maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0, - leafCol=""): + leafCol="", minWeightFractionPerNode=0.0): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \ impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0, \ - leafCol="") + leafCol="", minWeightFractionPerNode=0.0) Sets params for linear classification. """ kwargs = self._input_kwargs @@ -1338,7 +1339,8 @@ def setFeatureSubsetStrategy(self, value): class RandomForestClassificationModel(TreeEnsembleModel, JavaProbabilisticClassificationModel, - JavaMLWritable, JavaMLReadable): + RandomForestClassifierParams, JavaMLWritable, + JavaMLReadable): """ Model fitted by RandomForestClassifier. @@ -1390,8 +1392,8 @@ def getLossType(self): @inherit_doc -class GBTClassifier(JavaProbabilisticClassifier, GBTClassifierParams, HasCheckpointInterval, - HasSeed, JavaMLWritable, JavaMLReadable): +class GBTClassifier(JavaProbabilisticClassifier, GBTClassifierParams, + JavaMLWritable, JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for classification. 
@@ -1485,14 +1487,14 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, impurity="variance", featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None, - leafCol=""): + leafCol="", minWeightFractionPerNode=0.0): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ - validationIndicatorCol=None, leafCol="") + validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0) """ super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -1501,7 +1503,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0, impurity="variance", featureSubsetStrategy="all", validationTol=0.01, - leafCol="") + leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1512,14 +1514,14 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, impurity="variance", featureSubsetStrategy="all", validationTol=0.01, - validationIndicatorCol=None, leafCol=""): + validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, 
cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ - validationIndicatorCol=None, leafCol="") + validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0) Sets params for Gradient Boosted Tree Classification. """ kwargs = self._input_kwargs @@ -1601,7 +1603,7 @@ def setValidationIndicatorCol(self, value): class GBTClassificationModel(TreeEnsembleModel, JavaProbabilisticClassificationModel, - JavaMLWritable, JavaMLReadable): + GBTClassifierParams, JavaMLWritable, JavaMLReadable): """ Model fitted by GBTClassifier. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index f2bcc662030c6..637334d8adc61 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -19,6 +19,9 @@ from pyspark import since, keyword_only from pyspark.ml.param.shared import * +from pyspark.ml.tree import DecisionTreeModel, DecisionTreeParams, \ + TreeEnsembleModel, TreeEnsembleParams, RandomForestParams, GBTParams, \ + HasVarianceImpurity, TreeRegressorParams from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ JavaPredictor, JavaPredictionModel, JavaWrapper @@ -591,233 +594,19 @@ def predictions(self): return self._call_java("predictions") -class DecisionTreeParams(Params): +class DecisionTreeRegressorParams(DecisionTreeParams, TreeRegressorParams, HasVarianceCol): """ - Mixin for Decision Tree parameters. - """ - - leafCol = Param(Params._dummy(), "leafCol", "Leaf indices column name. Predicted leaf " + - "index of each instance in each tree by preorder.", - typeConverter=TypeConverters.toString) - - maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. 
(>= 0) E.g., " + - "depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", - typeConverter=TypeConverters.toInt) - - maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous " + - "features. Must be >=2 and >= number of categories for any categorical " + - "feature.", typeConverter=TypeConverters.toInt) - - minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of " + - "instances each child must have after split. If a split causes " + - "the left or right child to have fewer than " + - "minInstancesPerNode, the split will be discarded as invalid. " + - "Should be >= 1.", typeConverter=TypeConverters.toInt) - - minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split " + - "to be considered at a tree node.", typeConverter=TypeConverters.toFloat) - - maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to " + - "histogram aggregation. If too small, then 1 node will be split per " + - "iteration, and its aggregates may exceed this size.", - typeConverter=TypeConverters.toInt) - - cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass " + - "trees to executors to match instances with nodes. If true, the " + - "algorithm will cache node IDs for each instance. Caching can speed " + - "up training of deeper trees. Users can set how often should the cache " + - "be checkpointed or disable it by setting checkpointInterval.", - typeConverter=TypeConverters.toBoolean) - - def __init__(self): - super(DecisionTreeParams, self).__init__() - - def setLeafCol(self, value): - """ - Sets the value of :py:attr:`leafCol`. - """ - return self._set(leafCol=value) - - def getLeafCol(self): - """ - Gets the value of leafCol or its default value. - """ - return self.getOrDefault(self.leafCol) - - def getMaxDepth(self): - """ - Gets the value of maxDepth or its default value. 
- """ - return self.getOrDefault(self.maxDepth) - - def getMaxBins(self): - """ - Gets the value of maxBins or its default value. - """ - return self.getOrDefault(self.maxBins) - - def getMinInstancesPerNode(self): - """ - Gets the value of minInstancesPerNode or its default value. - """ - return self.getOrDefault(self.minInstancesPerNode) - - def getMinInfoGain(self): - """ - Gets the value of minInfoGain or its default value. - """ - return self.getOrDefault(self.minInfoGain) - - def getMaxMemoryInMB(self): - """ - Gets the value of maxMemoryInMB or its default value. - """ - return self.getOrDefault(self.maxMemoryInMB) - - def getCacheNodeIds(self): - """ - Gets the value of cacheNodeIds or its default value. - """ - return self.getOrDefault(self.cacheNodeIds) - - -class TreeEnsembleParams(DecisionTreeParams): - """ - Mixin for Decision Tree-based ensemble algorithms parameters. - """ - - subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " + - "used for learning each decision tree, in range (0, 1].", - typeConverter=TypeConverters.toFloat) - - supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"] - - featureSubsetStrategy = \ - Param(Params._dummy(), "featureSubsetStrategy", - "The number of features to consider for splits at each tree node. Supported " + - "options: 'auto' (choose automatically for task: If numTrees == 1, set to " + - "'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to " + - "'onethird' for regression), 'all' (use all features), 'onethird' (use " + - "1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use " + - "log2(number of features)), 'n' (when n is in the range (0, 1.0], use " + - "n * number of features. When n is in the range (1, number of features), use" + - " n features). 
default = 'auto'", typeConverter=TypeConverters.toString) - - def __init__(self): - super(TreeEnsembleParams, self).__init__() - - @since("1.4.0") - def getSubsamplingRate(self): - """ - Gets the value of subsamplingRate or its default value. - """ - return self.getOrDefault(self.subsamplingRate) - - @since("1.4.0") - def getFeatureSubsetStrategy(self): - """ - Gets the value of featureSubsetStrategy or its default value. - """ - return self.getOrDefault(self.featureSubsetStrategy) - - -class HasVarianceImpurity(Params): - """ - Private class to track supported impurity measures. - """ - - supportedImpurities = ["variance"] - - impurity = Param(Params._dummy(), "impurity", - "Criterion used for information gain calculation (case-insensitive). " + - "Supported options: " + - ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasVarianceImpurity, self).__init__() - - @since("1.4.0") - def getImpurity(self): - """ - Gets the value of impurity or its default value. - """ - return self.getOrDefault(self.impurity) - - -class TreeRegressorParams(HasVarianceImpurity): - pass - - -class RandomForestParams(TreeEnsembleParams): - """ - Private class to track supported random forest parameters. - """ - - numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).", - typeConverter=TypeConverters.toInt) - - def __init__(self): - super(RandomForestParams, self).__init__() - - @since("1.4.0") - def getNumTrees(self): - """ - Gets the value of numTrees or its default value. - """ - return self.getOrDefault(self.numTrees) - - -class GBTParams(TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndicatorCol): - """ - Private class to track supported GBT params. - """ - - stepSize = Param(Params._dummy(), "stepSize", - "Step size (a.k.a. 
learning rate) in interval (0, 1] for shrinking " + - "the contribution of each estimator.", - typeConverter=TypeConverters.toFloat) - - validationTol = Param(Params._dummy(), "validationTol", - "Threshold for stopping early when fit with validation is used. " + - "If the error rate on the validation input changes by less than the " + - "validationTol, then learning will stop early (before `maxIter`). " + - "This parameter is ignored when fit without validation is used.", - typeConverter=TypeConverters.toFloat) - - @since("3.0.0") - def getValidationTol(self): - """ - Gets the value of validationTol or its default value. - """ - return self.getOrDefault(self.validationTol) - - -class GBTRegressorParams(GBTParams, TreeRegressorParams): - """ - Private class to track supported GBTRegressor params. + (Private) Params for DecisionTree Regressor. .. versionadded:: 3.0.0 """ - supportedLossTypes = ["squared", "absolute"] - - lossType = Param(Params._dummy(), "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(supportedLossTypes), - typeConverter=TypeConverters.toString) - - @since("1.4.0") - def getLossType(self): - """ - Gets the value of lossType or its default value. - """ - return self.getOrDefault(self.lossType) + pass @inherit_doc -class DecisionTreeRegressor(JavaPredictor, HasWeightCol, DecisionTreeParams, TreeRegressorParams, - HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable, - HasVarianceCol): +class DecisionTreeRegressor(JavaPredictor, DecisionTreeRegressorParams, JavaMLWritable, + JavaMLReadable): """ `Decision tree `_ learning algorithm for regression. @@ -827,8 +616,12 @@ class DecisionTreeRegressor(JavaPredictor, HasWeightCol, DecisionTreeParams, Tre >>> df = spark.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... 
(0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance", leafCol="leafId") + >>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance") >>> model = dt.fit(df) + >>> model.getVarianceCol() + 'variance' + >>> model.setLeafCol("leafId") + DecisionTreeRegressionModel... >>> model.depth 1 >>> model.numNodes @@ -843,6 +636,8 @@ class DecisionTreeRegressor(JavaPredictor, HasWeightCol, DecisionTreeParams, Tre >>> result = model.transform(test0).head() >>> result.prediction 0.0 + >>> model.predictLeaf(test0.head().features) + 0.0 >>> result.leafId 0.0 >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) @@ -879,20 +674,21 @@ class DecisionTreeRegressor(JavaPredictor, HasWeightCol, DecisionTreeParams, Tre def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", - seed=None, varianceCol=None, weightCol=None, leafCol=""): + seed=None, varianceCol=None, weightCol=None, leafCol="", + minWeightFractionPerNode=0.0): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ impurity="variance", seed=None, varianceCol=None, weightCol=None, \ - leafCol="") + leafCol="", minWeightFractionPerNode=0.0) """ super(DecisionTreeRegressor, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.DecisionTreeRegressor", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", leafCol="") + impurity="variance", leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) 
@@ -902,13 +698,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", seed=None, varianceCol=None, weightCol=None, - leafCol=""): + leafCol="", minWeightFractionPerNode=0.0): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ impurity="variance", seed=None, varianceCol=None, weightCol=None, \ - leafCol="") + leafCol="", minWeightFractionPerNode=0.0) Sets params for the DecisionTreeRegressor. """ kwargs = self._input_kwargs @@ -935,6 +731,13 @@ def setMinInstancesPerNode(self, value): """ return self._set(minInstancesPerNode=value) + @since("3.0.0") + def setMinWeightFractionPerNode(self, value): + """ + Sets the value of :py:attr:`minWeightFractionPerNode`. + """ + return self._set(minWeightFractionPerNode=value) + def setMinInfoGain(self, value): """ Sets the value of :py:attr:`minInfoGain`. @@ -962,79 +765,8 @@ def setImpurity(self, value): @inherit_doc -class DecisionTreeModel(JavaPredictionModel): - """ - Abstraction for Decision Tree models. - - .. versionadded:: 1.5.0 - """ - - @property - @since("1.5.0") - def numNodes(self): - """Return number of nodes of the decision tree.""" - return self._call_java("numNodes") - - @property - @since("1.5.0") - def depth(self): - """Return depth of the decision tree.""" - return self._call_java("depth") - - @property - @since("2.0.0") - def toDebugString(self): - """Full description of model.""" - return self._call_java("toDebugString") - - def __repr__(self): - return self._call_java("toString") - - -@inherit_doc -class TreeEnsembleModel(JavaModel): - """ - (private abstraction) - - Represents a tree ensemble model. 
- """ - - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. Warning: These have null parent Estimators.""" - return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] - - @property - @since("2.0.0") - def getNumTrees(self): - """Number of trees in ensemble.""" - return self._call_java("getNumTrees") - - @property - @since("1.5.0") - def treeWeights(self): - """Return the weights for each tree""" - return list(self._call_java("javaTreeWeights")) - - @property - @since("2.0.0") - def totalNumNodes(self): - """Total number of nodes, summed over all trees in the ensemble.""" - return self._call_java("totalNumNodes") - - @property - @since("2.0.0") - def toDebugString(self): - """Full description of model.""" - return self._call_java("toDebugString") - - def __repr__(self): - return self._call_java("toString") - - -@inherit_doc -class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReadable): +class DecisionTreeRegressionModel(DecisionTreeModel, DecisionTreeRegressorParams, + JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`DecisionTreeRegressor`. @@ -1063,9 +795,17 @@ def featureImportances(self): return self._call_java("featureImportances") +class RandomForestRegressorParams(RandomForestParams, TreeRegressorParams): + """ + (Private) Params for RandomForest Regressor. + .. versionadded:: 3.0.0 + """ + pass + + @inherit_doc -class RandomForestRegressor(JavaPredictor, HasSeed, RandomForestParams, TreeRegressorParams, - HasCheckpointInterval, JavaMLWritable, JavaMLReadable): +class RandomForestRegressor(JavaPredictor, RandomForestRegressorParams, JavaMLWritable, + JavaMLReadable): """ `Random Forest `_ learning algorithm for regression. @@ -1076,8 +816,12 @@ class RandomForestRegressor(JavaPredictor, HasSeed, RandomForestParams, TreeRegr >>> df = spark.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... 
(0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42, leafCol="leafId") + >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) >>> model = rf.fit(df) + >>> model.getSeed() + 42 + >>> model.setLeafCol("leafId") + RandomForestRegressionModel... >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 1.0]) @@ -1085,6 +829,8 @@ class RandomForestRegressor(JavaPredictor, HasSeed, RandomForestParams, TreeRegr >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.predict(test0.head().features) 0.0 + >>> model.predictLeaf(test0.head().features) + DenseVector([0.0, 0.0]) >>> result = model.transform(test0).head() >>> result.prediction 0.0 @@ -1118,13 +864,13 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, - featureSubsetStrategy="auto", leafCol=""): + featureSubsetStrategy="auto", leafCol="", minWeightFractionPerNode=0.0): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ - featureSubsetStrategy="auto", leafCol="") + featureSubsetStrategy="auto", leafCol="", minWeightFractionPerNode=0.0) """ super(RandomForestRegressor, self).__init__() self._java_obj = self._new_java_obj( @@ -1132,7 +878,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", subsamplingRate=1.0, numTrees=20, -
featureSubsetStrategy="auto", leafCol="") + featureSubsetStrategy="auto", leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1142,13 +888,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, - featureSubsetStrategy="auto", leafCol=""): + featureSubsetStrategy="auto", leafCol="", minWeightFractionPerNode=0.0): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ - featureSubsetStrategy="auto", leafCol="") + featureSubsetStrategy="auto", leafCol="", minWeightFractionPerNode=0.0) Sets params for linear regression. """ kwargs = self._input_kwargs @@ -1222,8 +968,8 @@ def setFeatureSubsetStrategy(self, value): return self._set(featureSubsetStrategy=value) -class RandomForestRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, - JavaMLReadable): +class RandomForestRegressionModel(TreeEnsembleModel, RandomForestRegressorParams, + JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`RandomForestRegressor`. @@ -1252,9 +998,29 @@ def featureImportances(self): return self._call_java("featureImportances") +class GBTRegressorParams(GBTParams, TreeRegressorParams): + """ + (Private) class to track supported GBTRegressor params. + .. versionadded:: 3.0.0 + """ + + supportedLossTypes = ["squared", "absolute"] + + lossType = Param(Params._dummy(), "lossType", + "Loss function which GBT tries to minimize (case-insensitive). 
" + + "Supported options: " + ", ".join(supportedLossTypes), + typeConverter=TypeConverters.toString) + + @since("1.4.0") + def getLossType(self): + """ + Gets the value of lossType or its default value. + """ + return self.getOrDefault(self.lossType) + + @inherit_doc -class GBTRegressor(JavaPredictor, GBTRegressorParams, HasCheckpointInterval, HasSeed, - JavaMLWritable, JavaMLReadable): +class GBTRegressor(JavaPredictor, GBTRegressorParams, JavaMLWritable, JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for regression. @@ -1271,8 +1037,6 @@ class GBTRegressor(JavaPredictor, GBTRegressorParams, HasCheckpointInterval, Has >>> print(gbt.getFeatureSubsetStrategy()) all >>> model = gbt.fit(df) - >>> model.setFeaturesCol("features") - GBTRegressionModel... >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> model.numFeatures @@ -1282,6 +1046,8 @@ class GBTRegressor(JavaPredictor, GBTRegressorParams, HasCheckpointInterval, Has >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.predict(test0.head().features) 0.0 + >>> model.predictLeaf(test0.head().features) + DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]) >>> result = model.transform(test0).head() >>> result.prediction 0.0 @@ -1323,14 +1089,14 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, impurity="variance", featureSubsetStrategy="all", validationTol=0.01, - validationIndicatorCol=None, leafCol=""): + validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ impurity="variance", 
featureSubsetStrategy="all", validationTol=0.01, \ - validationIndicatorCol=None, leafCol="") + validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0) """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) @@ -1338,7 +1104,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, impurity="variance", featureSubsetStrategy="all", validationTol=0.01, - leafCol="") + leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1349,14 +1115,14 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, impuriy="variance", featureSubsetStrategy="all", validationTol=0.01, - validationIndicatorCol=None, leafCol=""): + validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ - validationIndicatorCol=None, leafCol="") + validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0) Sets params for Gradient Boosted Tree Regression. 
""" kwargs = self._input_kwargs @@ -1437,7 +1203,8 @@ def setValidationIndicatorCol(self, value): return self._set(validationIndicatorCol=value) -class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): +class GBTRegressionModel(TreeEnsembleModel, GBTRegressorParams, + JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`GBTRegressor`. diff --git a/python/pyspark/ml/tree.py b/python/pyspark/ml/tree.py new file mode 100644 index 0000000000000..488d73bf36665 --- /dev/null +++ b/python/pyspark/ml/tree.py @@ -0,0 +1,348 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark import since, keyword_only +from pyspark.ml.param.shared import * +from pyspark.ml.util import * +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ + JavaPredictor, JavaPredictionModel +from pyspark.ml.common import inherit_doc, _java2py, _py2java + + +@inherit_doc +class DecisionTreeModel(JavaPredictionModel): + """ + Abstraction for Decision Tree models. + .. 
versionadded:: 1.5.0 + """ + + @property + @since("1.5.0") + def numNodes(self): + """Return number of nodes of the decision tree.""" + return self._call_java("numNodes") + + @property + @since("1.5.0") + def depth(self): + """Return depth of the decision tree.""" + return self._call_java("depth") + + @property + @since("2.0.0") + def toDebugString(self): + """Full description of model.""" + return self._call_java("toDebugString") + + @since("3.0.0") + def predictLeaf(self, value): + """ + Predict the indices of the leaves corresponding to the feature vector. + """ + return self._call_java("predictLeaf", value) + + def __repr__(self): + return self._call_java("toString") + + +class DecisionTreeParams(HasCheckpointInterval, HasSeed, HasWeightCol): + """ + Mixin for Decision Tree parameters. + """ + + leafCol = Param(Params._dummy(), "leafCol", "Leaf indices column name. Predicted leaf " + + "index of each instance in each tree by preorder.", + typeConverter=TypeConverters.toString) + + maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., " + + "depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", + typeConverter=TypeConverters.toInt) + + maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous " + + "features. Must be >=2 and >= number of categories for any categorical " + + "feature.", typeConverter=TypeConverters.toInt) + + minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of " + + "instances each child must have after split. If a split causes " + + "the left or right child to have fewer than " + + "minInstancesPerNode, the split will be discarded as invalid. " + + "Should be >= 1.", typeConverter=TypeConverters.toInt) + + minWeightFractionPerNode = Param(Params._dummy(), "minWeightFractionPerNode", "Minimum " + "fraction of the weighted sample count that each child " + "must have after split. 
If a split causes the fraction " + "of the total weight in the left or right child to be " + "less than minWeightFractionPerNode, the split will be " + "discarded as invalid. Should be in interval [0.0, 0.5).", + typeConverter=TypeConverters.toFloat) + + minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split " + + "to be considered at a tree node.", typeConverter=TypeConverters.toFloat) + + maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to " + + "histogram aggregation. If too small, then 1 node will be split per " + + "iteration, and its aggregates may exceed this size.", + typeConverter=TypeConverters.toInt) + + cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass " + + "trees to executors to match instances with nodes. If true, the " + + "algorithm will cache node IDs for each instance. Caching can speed " + + "up training of deeper trees. Users can set how often should the cache " + + "be checkpointed or disable it by setting checkpointInterval.", + typeConverter=TypeConverters.toBoolean) + + def __init__(self): + super(DecisionTreeParams, self).__init__() + + def setLeafCol(self, value): + """ + Sets the value of :py:attr:`leafCol`. + """ + return self._set(leafCol=value) + + def getLeafCol(self): + """ + Gets the value of leafCol or its default value. + """ + return self.getOrDefault(self.leafCol) + + def getMaxDepth(self): + """ + Gets the value of maxDepth or its default value. + """ + return self.getOrDefault(self.maxDepth) + + def getMaxBins(self): + """ + Gets the value of maxBins or its default value. + """ + return self.getOrDefault(self.maxBins) + + def getMinInstancesPerNode(self): + """ + Gets the value of minInstancesPerNode or its default value. + """ + return self.getOrDefault(self.minInstancesPerNode) + + def getMinWeightFractionPerNode(self): + """ + Gets the value of minWeightFractionPerNode or its default value. 
+ """ + return self.getOrDefault(self.minWeightFractionPerNode) + + def getMinInfoGain(self): + """ + Gets the value of minInfoGain or its default value. + """ + return self.getOrDefault(self.minInfoGain) + + def getMaxMemoryInMB(self): + """ + Gets the value of maxMemoryInMB or its default value. + """ + return self.getOrDefault(self.maxMemoryInMB) + + def getCacheNodeIds(self): + """ + Gets the value of cacheNodeIds or its default value. + """ + return self.getOrDefault(self.cacheNodeIds) + + +@inherit_doc +class TreeEnsembleModel(JavaPredictionModel): + """ + (private abstraction) + Represents a tree ensemble model. + """ + + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. Warning: These have null parent Estimators.""" + return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] + + @property + @since("2.0.0") + def getNumTrees(self): + """Number of trees in ensemble.""" + return self._call_java("getNumTrees") + + @property + @since("1.5.0") + def treeWeights(self): + """Return the weights for each tree""" + return list(self._call_java("javaTreeWeights")) + + @property + @since("2.0.0") + def totalNumNodes(self): + """Total number of nodes, summed over all trees in the ensemble.""" + return self._call_java("totalNumNodes") + + @property + @since("2.0.0") + def toDebugString(self): + """Full description of model.""" + return self._call_java("toDebugString") + + @since("3.0.0") + def predictLeaf(self, value): + """ + Predict the indices of the leaves corresponding to the feature vector. + """ + return self._call_java("predictLeaf", value) + + def __repr__(self): + return self._call_java("toString") + + +class TreeEnsembleParams(DecisionTreeParams): + """ + Mixin for Decision Tree-based ensemble algorithms parameters. 
+ """ + + subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " + + "used for learning each decision tree, in range (0, 1].", + typeConverter=TypeConverters.toFloat) + + supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"] + + featureSubsetStrategy = \ + Param(Params._dummy(), "featureSubsetStrategy", + "The number of features to consider for splits at each tree node. Supported " + + "options: 'auto' (choose automatically for task: If numTrees == 1, set to " + + "'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to " + + "'onethird' for regression), 'all' (use all features), 'onethird' (use " + + "1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use " + + "log2(number of features)), 'n' (when n is in the range (0, 1.0], use " + + "n * number of features. When n is in the range (1, number of features), use" + + " n features). default = 'auto'", typeConverter=TypeConverters.toString) + + def __init__(self): + super(TreeEnsembleParams, self).__init__() + + @since("1.4.0") + def getSubsamplingRate(self): + """ + Gets the value of subsamplingRate or its default value. + """ + return self.getOrDefault(self.subsamplingRate) + + @since("1.4.0") + def getFeatureSubsetStrategy(self): + """ + Gets the value of featureSubsetStrategy or its default value. + """ + return self.getOrDefault(self.featureSubsetStrategy) + + +class RandomForestParams(TreeEnsembleParams): + """ + Private class to track supported random forest parameters. + """ + + numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).", + typeConverter=TypeConverters.toInt) + + def __init__(self): + super(RandomForestParams, self).__init__() + + @since("1.4.0") + def getNumTrees(self): + """ + Gets the value of numTrees or its default value. 
+ """ + return self.getOrDefault(self.numTrees) + + +class GBTParams(TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndicatorCol): + """ + Private class to track supported GBT params. + """ + + stepSize = Param(Params._dummy(), "stepSize", + "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " + + "the contribution of each estimator.", + typeConverter=TypeConverters.toFloat) + + validationTol = Param(Params._dummy(), "validationTol", + "Threshold for stopping early when fit with validation is used. " + + "If the error rate on the validation input changes by less than the " + + "validationTol, then learning will stop early (before `maxIter`). " + + "This parameter is ignored when fit without validation is used.", + typeConverter=TypeConverters.toFloat) + + @since("3.0.0") + def getValidationTol(self): + """ + Gets the value of validationTol or its default value. + """ + return self.getOrDefault(self.validationTol) + + +class HasVarianceImpurity(Params): + """ + Private class to track supported impurity measures. + """ + + supportedImpurities = ["variance"] + + impurity = Param(Params._dummy(), "impurity", + "Criterion used for information gain calculation (case-insensitive). " + + "Supported options: " + + ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasVarianceImpurity, self).__init__() + + @since("1.4.0") + def getImpurity(self): + """ + Gets the value of impurity or its default value. + """ + return self.getOrDefault(self.impurity) + + +class TreeClassifierParams(object): + """ + Private class to track supported impurity measures. + .. versionadded:: 1.4.0 + """ + supportedImpurities = ["entropy", "gini"] + + impurity = Param(Params._dummy(), "impurity", + "Criterion used for information gain calculation (case-insensitive). 
" + + "Supported options: " + + ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) + + def __init__(self): + super(TreeClassifierParams, self).__init__() + + @since("1.6.0") + def getImpurity(self): + """ + Gets the value of impurity or its default value. + """ + return self.getOrDefault(self.impurity) + + +class TreeRegressorParams(HasVarianceImpurity): + """ + Private class to track supported impurity measures. + """ + pass From b1c418a794138630c34e84a29996514dab058c15 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 25 Sep 2019 11:52:55 -0700 Subject: [PATCH 2/6] remove JavaProbabilisticClassifierParams from DecisionTreeClassifierParams --- python/pyspark/ml/classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 433cb631bacc7..162412b1e4950 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -941,8 +941,7 @@ class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, @inherit_doc -class DecisionTreeClassifierParams(DecisionTreeParams, TreeClassifierParams, - JavaProbabilisticClassifierParams): +class DecisionTreeClassifierParams(DecisionTreeParams, TreeClassifierParams): """ (Private) Params for DecisionTree Classifier. 
""" From b722aae97c9d45257be675331b95e01826ab5158 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 26 Sep 2019 10:53:26 -0700 Subject: [PATCH 3/6] add _ infront of xxxParams --- python/pyspark/ml/classification.py | 16 ++++++++-------- python/pyspark/ml/regression.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 162412b1e4950..52eefe76826c4 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -941,15 +941,15 @@ class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, @inherit_doc -class DecisionTreeClassifierParams(DecisionTreeParams, TreeClassifierParams): +class _DecisionTreeClassifierParams(DecisionTreeParams, TreeClassifierParams): """ - (Private) Params for DecisionTree Classifier. + Params for :py:attr:`DecisionTreeClassifier` and :py:attr:`DecisionTreeClassificationModel`. """ pass @inherit_doc -class DecisionTreeClassifier(JavaProbabilisticClassifier, DecisionTreeClassifierParams, +class DecisionTreeClassifier(JavaProbabilisticClassifier, _DecisionTreeClassifierParams, JavaMLWritable, JavaMLReadable): """ `Decision tree `_ @@ -1121,7 +1121,7 @@ def setImpurity(self, value): @inherit_doc class DecisionTreeClassificationModel(DecisionTreeModel, JavaProbabilisticClassificationModel, - DecisionTreeClassifierParams, JavaMLWritable, + _DecisionTreeClassifierParams, JavaMLWritable, JavaMLReadable): """ Model fitted by DecisionTreeClassifier. @@ -1152,15 +1152,15 @@ def featureImportances(self): @inherit_doc -class RandomForestClassifierParams(RandomForestParams, TreeClassifierParams): +class _RandomForestClassifierParams(RandomForestParams, TreeClassifierParams): """ - (Private) Params for RandomForest Classifier. + Params for :py:attr:`RandomForestClassifier` and :py:attr:`RandomForestClassificationModel`. 
""" pass @inherit_doc -class RandomForestClassifier(JavaProbabilisticClassifier, RandomForestClassifierParams, +class RandomForestClassifier(JavaProbabilisticClassifier, _RandomForestClassifierParams, JavaMLWritable, JavaMLReadable): """ `Random Forest `_ @@ -1338,7 +1338,7 @@ def setFeatureSubsetStrategy(self, value): class RandomForestClassificationModel(TreeEnsembleModel, JavaProbabilisticClassificationModel, - RandomForestClassifierParams, JavaMLWritable, + _RandomForestClassifierParams, JavaMLWritable, JavaMLReadable): """ Model fitted by RandomForestClassifier. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 637334d8adc61..0acf4c9c6a555 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -594,9 +594,9 @@ def predictions(self): return self._call_java("predictions") -class DecisionTreeRegressorParams(DecisionTreeParams, TreeRegressorParams, HasVarianceCol): +class _DecisionTreeRegressorParams(DecisionTreeParams, TreeRegressorParams, HasVarianceCol): """ - (Private) Params for DecisionTree Regressor. + Params for :py:attr:`DecisionTreeRegressor` and :py:attr:`DecisionTreeRegressionModel`. .. versionadded:: 3.0.0 """ @@ -605,7 +605,7 @@ class DecisionTreeRegressorParams(DecisionTreeParams, TreeRegressorParams, HasVa @inherit_doc -class DecisionTreeRegressor(JavaPredictor, DecisionTreeRegressorParams, JavaMLWritable, +class DecisionTreeRegressor(JavaPredictor, _DecisionTreeRegressorParams, JavaMLWritable, JavaMLReadable): """ `Decision tree `_ @@ -765,7 +765,7 @@ def setImpurity(self, value): @inherit_doc -class DecisionTreeRegressionModel(DecisionTreeModel, DecisionTreeRegressorParams, +class DecisionTreeRegressionModel(DecisionTreeModel, _DecisionTreeRegressorParams, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`DecisionTreeRegressor`. 
@@ -795,16 +795,16 @@ def featureImportances(self): return self._call_java("featureImportances") -class RandomForestRegressorParams(RandomForestParams, TreeRegressorParams): +class _RandomForestRegressorParams(RandomForestParams, TreeRegressorParams): """ - (Private) Params for RandomForest Regressor. + Params for :py:attr:`RandomForestRegressor` and :py:attr:`RandomForestRegressionModel`. .. versionadded:: 3.0.0 """ pass @inherit_doc -class RandomForestRegressor(JavaPredictor, RandomForestRegressorParams, JavaMLWritable, +class RandomForestRegressor(JavaPredictor, _RandomForestRegressorParams, JavaMLWritable, JavaMLReadable): """ `Random Forest `_ @@ -968,7 +968,7 @@ def setFeatureSubsetStrategy(self, value): return self._set(featureSubsetStrategy=value) -class RandomForestRegressionModel(TreeEnsembleModel, RandomForestRegressorParams, +class RandomForestRegressionModel(TreeEnsembleModel, _RandomForestRegressorParams, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`RandomForestRegressor`. From 61f318bc94a9913eddc24818ffa164821b2ebde3 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 3 Oct 2019 15:03:03 -0700 Subject: [PATCH 4/6] change :py:attr: to :py:attr:class for classes --- python/pyspark/ml/classification.py | 4 ++-- python/pyspark/ml/regression.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 52eefe76826c4..e3010f8941b3c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -943,7 +943,7 @@ class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, @inherit_doc class _DecisionTreeClassifierParams(DecisionTreeParams, TreeClassifierParams): """ - Params for :py:attr:`DecisionTreeClassifier` and :py:attr:`DecisionTreeClassificationModel`. + Params for :py:class:`DecisionTreeClassifier` and :py:class:`DecisionTreeClassificationModel`. 
""" pass @@ -1154,7 +1154,7 @@ def featureImportances(self): @inherit_doc class _RandomForestClassifierParams(RandomForestParams, TreeClassifierParams): """ - Params for :py:attr:`RandomForestClassifier` and :py:attr:`RandomForestClassificationModel`. + Params for :py:class:`RandomForestClassifier` and :py:class:`RandomForestClassificationModel`. """ pass diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 0acf4c9c6a555..645a03129e47f 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -596,7 +596,7 @@ def predictions(self): class _DecisionTreeRegressorParams(DecisionTreeParams, TreeRegressorParams, HasVarianceCol): """ - Params for :py:attr:`DecisionTreeRegressor` and :py:attr:`DecisionTreeRegressionModel`. + Params for :py:class:`DecisionTreeRegressor` and :py:class:`DecisionTreeRegressionModel`. .. versionadded:: 3.0.0 """ @@ -797,7 +797,7 @@ def featureImportances(self): class _RandomForestRegressorParams(RandomForestParams, TreeRegressorParams): """ - Params for :py:attr:`RandomForestRegressor` and :py:attr:`RandomForestRegressionModel`. + Params for :py:class:`RandomForestRegressor` and :py:class:`RandomForestRegressionModel`. .. versionadded:: 3.0.0 """ pass From 7034a5e733f62d4227fb85ee1ea5e10a0795da4a Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 3 Oct 2019 15:18:50 -0700 Subject: [PATCH 5/6] add one more line before .. versionadded --- python/pyspark/ml/regression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 645a03129e47f..65321f66e04d2 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -798,6 +798,7 @@ def featureImportances(self): class _RandomForestRegressorParams(RandomForestParams, TreeRegressorParams): """ Params for :py:class:`RandomForestRegressor` and :py:class:`RandomForestRegressionModel`. + .. 
versionadded:: 3.0.0 """ pass From 302e98eb8fc9cd8e776b7b6d8507f256ac6b31a2 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 8 Oct 2019 23:03:49 -0700 Subject: [PATCH 6/6] address comments --- python/pyspark/ml/classification.py | 18 ++++++++-------- python/pyspark/ml/regression.py | 24 ++++++++++----------- python/pyspark/ml/tree.py | 33 ++++++++++++++++------------- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e3010f8941b3c..d18cfa13aa898 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -22,9 +22,9 @@ from pyspark import since, keyword_only from pyspark.ml import Estimator, Model from pyspark.ml.param.shared import * -from pyspark.ml.tree import DecisionTreeModel, DecisionTreeParams, \ - TreeEnsembleModel, RandomForestParams, GBTParams, \ - HasVarianceImpurity, TreeClassifierParams, TreeEnsembleParams +from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \ + _TreeEnsembleModel, _RandomForestParams, _GBTParams, \ + _HasVarianceImpurity, _TreeClassifierParams, _TreeEnsembleParams from pyspark.ml.regression import DecisionTreeRegressionModel from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ @@ -941,7 +941,7 @@ class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, @inherit_doc -class _DecisionTreeClassifierParams(DecisionTreeParams, TreeClassifierParams): +class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams): """ Params for :py:class:`DecisionTreeClassifier` and :py:class:`DecisionTreeClassificationModel`. 
""" @@ -1120,7 +1120,7 @@ def setImpurity(self, value): @inherit_doc -class DecisionTreeClassificationModel(DecisionTreeModel, JavaProbabilisticClassificationModel, +class DecisionTreeClassificationModel(_DecisionTreeModel, JavaProbabilisticClassificationModel, _DecisionTreeClassifierParams, JavaMLWritable, JavaMLReadable): """ @@ -1152,7 +1152,7 @@ def featureImportances(self): @inherit_doc -class _RandomForestClassifierParams(RandomForestParams, TreeClassifierParams): +class _RandomForestClassifierParams(_RandomForestParams, _TreeClassifierParams): """ Params for :py:class:`RandomForestClassifier` and :py:class:`RandomForestClassificationModel`. """ @@ -1337,7 +1337,7 @@ def setFeatureSubsetStrategy(self, value): return self._set(featureSubsetStrategy=value) -class RandomForestClassificationModel(TreeEnsembleModel, JavaProbabilisticClassificationModel, +class RandomForestClassificationModel(_TreeEnsembleModel, JavaProbabilisticClassificationModel, _RandomForestClassifierParams, JavaMLWritable, JavaMLReadable): """ @@ -1368,7 +1368,7 @@ def trees(self): return [DecisionTreeClassificationModel(m) for m in list(self._call_java("trees"))] -class GBTClassifierParams(GBTParams, HasVarianceImpurity): +class GBTClassifierParams(_GBTParams, _HasVarianceImpurity): """ Private class to track supported GBTClassifier params. @@ -1601,7 +1601,7 @@ def setValidationIndicatorCol(self, value): return self._set(validationIndicatorCol=value) -class GBTClassificationModel(TreeEnsembleModel, JavaProbabilisticClassificationModel, +class GBTClassificationModel(_TreeEnsembleModel, JavaProbabilisticClassificationModel, GBTClassifierParams, JavaMLWritable, JavaMLReadable): """ Model fitted by GBTClassifier. 
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 65321f66e04d2..1896511ecf559 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -19,9 +19,9 @@ from pyspark import since, keyword_only from pyspark.ml.param.shared import * -from pyspark.ml.tree import DecisionTreeModel, DecisionTreeParams, \ - TreeEnsembleModel, TreeEnsembleParams, RandomForestParams, GBTParams, \ - HasVarianceImpurity, TreeRegressorParams +from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \ + _TreeEnsembleModel, _TreeEnsembleParams, _RandomForestParams, _GBTParams, \ + _HasVarianceImpurity, _TreeRegressorParams from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ JavaPredictor, JavaPredictionModel, JavaWrapper @@ -594,7 +594,7 @@ def predictions(self): return self._call_java("predictions") -class _DecisionTreeRegressorParams(DecisionTreeParams, TreeRegressorParams, HasVarianceCol): +class _DecisionTreeRegressorParams(_DecisionTreeParams, _TreeRegressorParams, HasVarianceCol): """ Params for :py:class:`DecisionTreeRegressor` and :py:class:`DecisionTreeRegressionModel`. @@ -765,7 +765,7 @@ def setImpurity(self, value): @inherit_doc -class DecisionTreeRegressionModel(DecisionTreeModel, _DecisionTreeRegressorParams, +class DecisionTreeRegressionModel(_DecisionTreeModel, _DecisionTreeRegressorParams, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`DecisionTreeRegressor`. @@ -795,7 +795,7 @@ def featureImportances(self): return self._call_java("featureImportances") -class _RandomForestRegressorParams(RandomForestParams, TreeRegressorParams): +class _RandomForestRegressorParams(_RandomForestParams, _TreeRegressorParams): """ Params for :py:class:`RandomForestRegressor` and :py:class:`RandomForestRegressionModel`. 
@@ -969,7 +969,7 @@ def setFeatureSubsetStrategy(self, value): return self._set(featureSubsetStrategy=value) -class RandomForestRegressionModel(TreeEnsembleModel, _RandomForestRegressorParams, +class RandomForestRegressionModel(_TreeEnsembleModel, _RandomForestRegressorParams, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`RandomForestRegressor`. @@ -999,9 +999,10 @@ def featureImportances(self): return self._call_java("featureImportances") -class GBTRegressorParams(GBTParams, TreeRegressorParams): +class _GBTRegressorParams(_GBTParams, _TreeRegressorParams): """ - (Private) class to track supported GBTRegressor params. + Params for :py:class:`GBTRegressor` and :py:class:`GBTRegressionModel`. + .. versionadded:: 3.0.0 """ @@ -1021,7 +1022,7 @@ def getLossType(self): @inherit_doc -class GBTRegressor(JavaPredictor, GBTRegressorParams, JavaMLWritable, JavaMLReadable): +class GBTRegressor(JavaPredictor, _GBTRegressorParams, JavaMLWritable, JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for regression. @@ -1204,8 +1205,7 @@ def setValidationIndicatorCol(self, value): return self._set(validationIndicatorCol=value) -class GBTRegressionModel(TreeEnsembleModel, GBTRegressorParams, - JavaMLWritable, JavaMLReadable): +class GBTRegressionModel(_TreeEnsembleModel, _GBTRegressorParams, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`GBTRegressor`. diff --git a/python/pyspark/ml/tree.py b/python/pyspark/ml/tree.py index 488d73bf36665..f38a7375c2c54 100644 --- a/python/pyspark/ml/tree.py +++ b/python/pyspark/ml/tree.py @@ -24,9 +24,10 @@ @inherit_doc -class DecisionTreeModel(JavaPredictionModel): +class _DecisionTreeModel(JavaPredictionModel): """ Abstraction for Decision Tree models. + .. 
versionadded:: 1.5.0 """ @@ -59,7 +60,7 @@ def __repr__(self): return self._call_java("toString") -class DecisionTreeParams(HasCheckpointInterval, HasSeed, HasWeightCol): +class _DecisionTreeParams(HasCheckpointInterval, HasSeed, HasWeightCol): """ Mixin for Decision Tree parameters. """ @@ -106,7 +107,7 @@ class DecisionTreeParams(HasCheckpointInterval, HasSeed, HasWeightCol): typeConverter=TypeConverters.toBoolean) def __init__(self): - super(DecisionTreeParams, self).__init__() + super(_DecisionTreeParams, self).__init__() def setLeafCol(self, value): """ @@ -164,7 +165,7 @@ def getCacheNodeIds(self): @inherit_doc -class TreeEnsembleModel(JavaPredictionModel): +class _TreeEnsembleModel(JavaPredictionModel): """ (private abstraction) Represents a tree ensemble model. @@ -174,7 +175,7 @@ class TreeEnsembleModel(JavaPredictionModel): @since("2.0.0") def trees(self): """Trees in this ensemble. Warning: These have null parent Estimators.""" - return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] + return [_DecisionTreeModel(m) for m in list(self._call_java("trees"))] @property @since("2.0.0") @@ -211,7 +212,7 @@ def __repr__(self): return self._call_java("toString") -class TreeEnsembleParams(DecisionTreeParams): +class _TreeEnsembleParams(_DecisionTreeParams): """ Mixin for Decision Tree-based ensemble algorithms parameters. """ @@ -234,7 +235,7 @@ class TreeEnsembleParams(DecisionTreeParams): " n features). default = 'auto'", typeConverter=TypeConverters.toString) def __init__(self): - super(TreeEnsembleParams, self).__init__() + super(_TreeEnsembleParams, self).__init__() @since("1.4.0") def getSubsamplingRate(self): @@ -251,7 +252,7 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class RandomForestParams(TreeEnsembleParams): +class _RandomForestParams(_TreeEnsembleParams): """ Private class to track supported random forest parameters. 
""" @@ -260,7 +261,7 @@ class RandomForestParams(TreeEnsembleParams): typeConverter=TypeConverters.toInt) def __init__(self): - super(RandomForestParams, self).__init__() + super(_RandomForestParams, self).__init__() @since("1.4.0") def getNumTrees(self): @@ -270,7 +271,7 @@ def getNumTrees(self): return self.getOrDefault(self.numTrees) -class GBTParams(TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndicatorCol): +class _GBTParams(_TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndicatorCol): """ Private class to track supported GBT params. """ @@ -295,7 +296,7 @@ def getValidationTol(self): return self.getOrDefault(self.validationTol) -class HasVarianceImpurity(Params): +class _HasVarianceImpurity(Params): """ Private class to track supported impurity measures. """ @@ -308,7 +309,7 @@ class HasVarianceImpurity(Params): ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) def __init__(self): - super(HasVarianceImpurity, self).__init__() + super(_HasVarianceImpurity, self).__init__() @since("1.4.0") def getImpurity(self): @@ -318,11 +319,13 @@ def getImpurity(self): return self.getOrDefault(self.impurity) -class TreeClassifierParams(object): +class _TreeClassifierParams(object): """ Private class to track supported impurity measures. + .. versionadded:: 1.4.0 """ + supportedImpurities = ["entropy", "gini"] impurity = Param(Params._dummy(), "impurity", @@ -331,7 +334,7 @@ class TreeClassifierParams(object): ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) def __init__(self): - super(TreeClassifierParams, self).__init__() + super(_TreeClassifierParams, self).__init__() @since("1.6.0") def getImpurity(self): @@ -341,7 +344,7 @@ def getImpurity(self): return self.getOrDefault(self.impurity) -class TreeRegressorParams(HasVarianceImpurity): +class _TreeRegressorParams(_HasVarianceImpurity): """ Private class to track supported impurity measures. """