From a67163e285876651dbb7eeb1d0ee66aa30dbce31 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 23 Oct 2019 11:36:16 -0700 Subject: [PATCH 1/4] [SPARK-29093][PYTHON][ML] Remove automatically generated param setters in _shared_params_code_gen.py --- .../apache/spark/ml/feature/MinHashLSH.scala | 14 - .../ml/feature/QuantileDiscretizer.scala | 5 +- python/pyspark/ml/base.py | 12 + python/pyspark/ml/classification.py | 324 +++++++- python/pyspark/ml/clustering.py | 254 +++++- python/pyspark/ml/evaluation.py | 128 ++- python/pyspark/ml/feature.py | 758 +++++++++++++++++- python/pyspark/ml/fpm.py | 13 + .../ml/param/_shared_params_code_gen.py | 6 - python/pyspark/ml/param/shared.py | 186 ----- python/pyspark/ml/recommendation.py | 48 +- python/pyspark/ml/regression.py | 358 ++++++++- python/pyspark/ml/tests/test_param.py | 12 +- python/pyspark/ml/tuning.py | 36 + 14 files changed, 1891 insertions(+), 263 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala index da0eaad667ccb..a56aae65dd151 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala @@ -48,14 +48,6 @@ class MinHashLSHModel private[ml]( private[ml] val randCoefficients: Array[(Int, Int)]) extends LSHModel[MinHashLSHModel] { - /** @group setParam */ - @Since("2.4.0") - override def setInputCol(value: String): this.type = super.set(inputCol, value) - - /** @group setParam */ - @Since("2.4.0") - override def setOutputCol(value: String): this.type = super.set(outputCol, value) - @Since("2.1.0") override protected[ml] def hashFunction(elems: Vector): Array[Vector] = { require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.") @@ -113,12 +105,6 @@ class MinHashLSHModel private[ml]( @Since("2.1.0") class MinHashLSH(override val uid: String) extends LSH[MinHashLSHModel] with HasSeed { - @Since("2.1.0") - override def setInputCol(value: String): this.type = super.setInputCol(value) - - @Since("2.1.0") - override def setOutputCol(value: String): this.type = super.setOutputCol(value) - @Since("2.1.0") override def setNumHashTables(value: Int): this.type = super.setNumHashTables(value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index aa4ab5903f711..eb78d8224fc3f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.types.StructType * Params for [[QuantileDiscretizer]]. */ private[feature] trait QuantileDiscretizerBase extends Params - with HasHandleInvalid with HasInputCol with HasOutputCol { + with HasHandleInvalid with HasInputCol with HasOutputCol with HasInputCols with HasOutputCols { /** * Number of buckets (quantiles, or categories) into which data points are grouped. 
Must @@ -129,8 +129,7 @@ private[feature] trait QuantileDiscretizerBase extends Params */ @Since("1.6.0") final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val uid: String) - extends Estimator[Bucketizer] with QuantileDiscretizerBase with DefaultParamsWritable - with HasInputCols with HasOutputCols { + extends Estimator[Bucketizer] with QuantileDiscretizerBase with DefaultParamsWritable { @Since("1.6.0") def this() = this(Identifiable.randomUID("quantileDiscretizer")) diff --git a/python/pyspark/ml/base.py b/python/pyspark/ml/base.py index 82ff81c58d3c6..542cb25172ead 100644 --- a/python/pyspark/ml/base.py +++ b/python/pyspark/ml/base.py @@ -194,6 +194,18 @@ class UnaryTransformer(HasInputCol, HasOutputCol, Transformer): .. versionadded:: 2.3.0 """ + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @abstractmethod def createTransformFunc(self): """ diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index d0c821329471f..c5cdf35729dd8 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -177,7 +177,19 @@ class LinearSVC(JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadable >>> df = sc.parallelize([ ... Row(label=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), ... Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() - >>> svm = LinearSVC(maxIter=5, regParam=0.01) + >>> svm = LinearSVC() + >>> svm.getMaxIter() + 100 + >>> svm.setMaxIter(5) + LinearSVC... + >>> svm.getMaxIter() + 5 + >>> svm.getRegParam() + 0.0 + >>> svm.setRegParam(0.01) + LinearSVC... + >>> svm.getRegParam() + 0.01 >>> model = svm.fit(df) >>> model.setPredictionCol("newPrediction") LinearSVC... @@ -257,6 +269,62 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return LinearSVCModel(java_model) + @since("2.2.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("2.2.0") + def setRegParam(self, value): + """ + Sets the value of :py:attr:`regParam`. + """ + return self._set(regParam=value) + + @since("2.2.0") + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + + @since("2.2.0") + def setFitIntercept(self, value): + """ + Sets the value of :py:attr:`fitIntercept`. + """ + return self._set(fitIntercept=value) + + @since("2.2.0") + def setStandardization(self, value): + """ + Sets the value of :py:attr:`standardization`. + """ + return self._set(standardization=value) + + @since("2.2.0") + def setThreshold(self, value): + """ + Sets the value of :py:attr:`threshold`. + """ + return self._set(threshold=value) + + @since("2.2.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + + @since("2.2.0") + def setAggregationDepth(self, value): + """ + Sets the value of :py:attr:`aggregationDepth`. + """ + return self._set(aggregationDepth=value) + class LinearSVCModel(JavaClassificationModel, _LinearSVCParams, JavaMLWritable, JavaMLReadable): """ @@ -265,6 +333,13 @@ class LinearSVCModel(JavaClassificationModel, _LinearSVCParams, JavaMLWritable, .. 
versionadded:: 2.2.0 """ + @since("3.0.0") + def setThreshold(self, value): + """ + Sets the value of :py:attr:`threshold`. + """ + return self._set(threshold=value) + @property @since("2.2.0") def coefficients(self): @@ -454,7 +529,18 @@ class LogisticRegression(JavaProbabilisticClassifier, _LogisticRegressionParams, ... Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)), ... Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)), ... Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))]).toDF() - >>> blor = LogisticRegression(regParam=0.01, weightCol="weight") + >>> blor = LogisticRegression(weightCol="weight") + >>> blor.getRegParam() + 0.0 + >>> blor.setRegParam(0.01) + LogisticRegression... + >>> blor.getRegParam() + 0.01 + >>> blor.setMaxIter(10) + LogisticRegression... + >>> blor.getMaxIter() + 10 + >>> blor.clear(blor.maxIter) >>> blorModel = blor.fit(bdf) >>> blorModel.setFeaturesCol("features") LogisticRegressionModel... @@ -603,6 +689,54 @@ def setUpperBoundsOnIntercepts(self, value): """ return self._set(upperBoundsOnIntercepts=value) + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + def setRegParam(self, value): + """ + Sets the value of :py:attr:`regParam`. + """ + return self._set(regParam=value) + + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + + def setElasticNetParam(self, value): + """ + Sets the value of :py:attr:`elasticNetParam`. + """ + return self._set(elasticNetParam=value) + + def setFitIntercept(self, value): + """ + Sets the value of :py:attr:`fitIntercept`. + """ + return self._set(fitIntercept=value) + + def setStandardization(self, value): + """ + Sets the value of :py:attr:`standardization`. + """ + return self._set(standardization=value) + + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + + def setAggregationDepth(self, value): + """ + Sets the value of :py:attr:`aggregationDepth`. + """ + return self._set(aggregationDepth=value) + class LogisticRegressionModel(JavaProbabilisticClassificationModel, _LogisticRegressionParams, JavaMLWritable, JavaMLReadable, HasTrainingSummary): @@ -1148,6 +1282,27 @@ def setImpurity(self, value): """ return self._set(impurity=value) + @since("1.4.0") + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. + """ + return self._set(checkpointInterval=value) + + @since("1.4.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("3.0.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + @inherit_doc class DecisionTreeClassificationModel(_DecisionTreeModel, JavaProbabilisticClassificationModel, @@ -1366,6 +1521,18 @@ def setFeatureSubsetStrategy(self, value): """ return self._set(featureSubsetStrategy=value) + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. 
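+
+ A minimal usage sketch (defaults only, mirroring the doctests elsewhere in this patch):
+
+ >>> rf = RandomForestClassifier().setCheckpointInterval(5)
+ >>> rf.getCheckpointInterval()
+ 5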
+ """ + return self._set(checkpointInterval=value) + class RandomForestClassificationModel(_TreeEnsembleModel, JavaProbabilisticClassificationModel, _RandomForestClassifierParams, JavaMLWritable, @@ -1451,6 +1618,10 @@ class GBTClassifier(JavaProbabilisticClassifier, _GBTClassifierParams, >>> td = si_model.transform(df) >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42, ... leafCol="leafId") + >>> gbt.setMaxIter(5) + GBTClassifier... + >>> gbt.getMaxIter() + 5 >>> gbt.getFeatureSubsetStrategy() 'all' >>> model = gbt.fit(td) @@ -1630,6 +1801,34 @@ def setValidationIndicatorCol(self, value): """ return self._set(validationIndicatorCol=value) + @since("1.4.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("1.4.0") + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. + """ + return self._set(checkpointInterval=value) + + @since("1.4.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("1.4.0") + def setStepSize(self, value): + """ + Sets the value of :py:attr:`stepSize`. + """ + return self._set(stepSize=value) + class GBTClassificationModel(_TreeEnsembleModel, JavaProbabilisticClassificationModel, _GBTClassifierParams, JavaMLWritable, JavaMLReadable): @@ -1723,10 +1922,6 @@ class NaiveBayes(JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds, >>> model = nb.fit(df) >>> model.setFeaturesCol("features") NaiveBayes_... - >>> model.setLabelCol("newLabel") - NaiveBayes_... - >>> model.getLabelCol() - 'newLabel' >>> model.getSmoothing() 1.0 >>> model.pi @@ -1814,6 +2009,12 @@ def setModelType(self, value): """ return self._set(modelType=value) + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + class NaiveBayesModel(JavaProbabilisticClassificationModel, _NaiveBayesParams, JavaMLWritable, JavaMLReadable): @@ -1906,7 +2107,11 @@ class MultilayerPerceptronClassifier(JavaProbabilisticClassifier, _MultilayerPer ... (1.0, Vectors.dense([0.0, 1.0])), ... (1.0, Vectors.dense([1.0, 0.0])), ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) - >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 2, 2], blockSize=1, seed=123) + >>> mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], blockSize=1, seed=123) + >>> mlp.setMaxIter(100) + MultilayerPerceptronClassifier... + >>> mlp.getMaxIter() + 100 >>> model = mlp.fit(df) >>> model.setFeaturesCol("features") MultilayerPerceptronClassifier... @@ -2000,6 +2205,31 @@ def setBlockSize(self, value): """ return self._set(blockSize=value) + @since("2.0.0") + def setInitialWeights(self, value): + """ + Sets the value of :py:attr:`initialWeights`. + """ + return self._set(initialWeights=value) + + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + @since("2.0.0") def setStepSize(self, value): """ @@ -2007,12 +2237,11 @@ def setStepSize(self, value): """ return self._set(stepSize=value) - @since("2.0.0") - def setInitialWeights(self, value): + def setSolver(self, value): """ - Sets the value of :py:attr:`initialWeights`. + Sets the value of :py:attr:`solver`. 
""" - return self._set(initialWeights=value) + return self._set(solver=value) class MultilayerPerceptronClassificationModel(JavaProbabilisticClassificationModel, JavaMLWritable, @@ -2134,6 +2363,42 @@ def setClassifier(self, value): """ return self._set(classifier=value) + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + def setRawPredictionCol(self, value): + """ + Sets the value of :py:attr:`rawPredictionCol`. + """ + return self._set(rawPredictionCol=value) + + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + + def setParallelism(self, value): + """ + Sets the value of :py:attr:`parallelism`. + """ + return self._set(parallelism=value) + def _fit(self, dataset): labelCol = self.getLabelCol() featuresCol = self.getFeaturesCol() @@ -2287,6 +2552,43 @@ class OneVsRestModel(Model, _OneVsRestParams, JavaMLReadable, JavaMLWritable): .. versionadded:: 2.0.0 """ + @since("2.0.0") + def setClassifier(self, value): + """ + Sets the value of :py:attr:`classifier`. + """ + return self._set(classifier=value) + + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + def setRawPredictionCol(self, value): + """ + Sets the value of :py:attr:`rawPredictionCol`. + """ + return self._set(rawPredictionCol=value) + + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + def __init__(self, models): super(OneVsRestModel, self).__init__() self.models = models diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index cbbbd36955dc0..bb73dc78c4ab4 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -123,6 +123,27 @@ class GaussianMixtureModel(JavaModel, _GaussianMixtureParams, JavaMLWritable, Ja .. versionadded:: 2.0.0 """ + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("3.0.0") + def setProbabilityCol(self, value): + """ + Sets the value of :py:attr:`probabilityCol`. + """ + return self._set(probabilityCol=value) + @property @since("2.0.0") def weights(self): @@ -200,8 +221,13 @@ class GaussianMixture(JavaEstimator, _GaussianMixtureParams, JavaMLWritable, Jav ... (Vectors.dense([-0.83, -0.68]),), ... (Vectors.dense([-0.91, -0.76]),)] >>> df = spark.createDataFrame(data, ["features"]) - >>> gm = GaussianMixture(k=3, tol=0.0001, - ... maxIter=10, seed=10) + >>> gm = GaussianMixture(k=3, tol=0.0001, seed=10) + >>> gm.getMaxIter() + 100 + >>> gm.setMaxIter(10) + GaussianMixture... 
+ >>> gm.getMaxIter() + 10 >>> model = gm.fit(df) >>> model.getFeaturesCol() 'features' @@ -290,6 +316,48 @@ def setK(self, value): """ return self._set(k=value) + @since("2.0.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("2.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("2.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("2.0.0") + def setProbabilityCol(self, value): + """ + Sets the value of :py:attr:`probabilityCol`. + """ + return self._set(probabilityCol=value) + + @since("2.0.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("2.0.0") + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + class GaussianMixtureSummary(ClusteringSummary): """ @@ -389,6 +457,20 @@ class KMeansModel(JavaModel, _KMeansParams, GeneralJavaMLWritable, JavaMLReadabl .. versionadded:: 1.5.0 """ + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + @since("1.5.0") def clusterCenters(self): """Get the cluster centers, represented as a list of NumPy arrays.""" @@ -425,7 +507,14 @@ class KMeans(JavaEstimator, _KMeansParams, JavaMLWritable, JavaMLReadable): >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), ... (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] >>> df = spark.createDataFrame(data, ["features"]) - >>> kmeans = KMeans(k=2, seed=1) + >>> kmeans = KMeans(k=2) + >>> kmeans.setSeed(1) + KMeans... + >>> kmeans.setMaxIter(10) + KMeans... + >>> kmeans.getMaxIter() + 10 + >>> kmeans.clear(kmeans.maxIter) >>> model = kmeans.fit(df) >>> model.getDistanceMeasure() 'euclidean' @@ -531,6 +620,41 @@ def setDistanceMeasure(self, value): """ return self._set(distanceMeasure=value) + @since("1.5.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("1.5.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("1.5.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("1.5.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("1.5.0") + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + @inherit_doc class _BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, @@ -571,6 +695,20 @@ class BisectingKMeansModel(JavaModel, _BisectingKMeansParams, JavaMLWritable, Ja .. versionadded:: 2.0.0 """ + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. 
+ """ + return self._set(predictionCol=value) + @since("2.0.0") def clusterCenters(self): """Get the cluster centers, represented as a list of NumPy arrays.""" @@ -629,6 +767,16 @@ class BisectingKMeans(JavaEstimator, _BisectingKMeansParams, JavaMLWritable, Jav ... (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] >>> df = spark.createDataFrame(data, ["features"]) >>> bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0) + >>> bkm.setMaxIter(10) + BisectingKMeans... + >>> bkm.getMaxIter() + 10 + >>> bkm.clear(bkm.maxIter) + >>> bkm.setSeed(1) + BisectingKMeans... + >>> bkm.getSeed() + 1 + >>> bkm.clear(bkm.seed) >>> model = bkm.fit(df) >>> model.getMaxIter() 20 @@ -723,6 +871,34 @@ def setDistanceMeasure(self, value): """ return self._set(distanceMeasure=value) + @since("2.0.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("2.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("2.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("2.0.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + def _create_model(self, java_model): return BisectingKMeansModel(java_model) @@ -873,6 +1049,31 @@ class LDAModel(JavaModel, _LDAParams): .. versionadded:: 2.0.0 """ + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("3.0.0") + def setTopicDistributionCol(self, value): + """ + Sets the value of :py:attr:`topicDistributionCol`. + + >>> algo = LDA().setTopicDistributionCol("topicDistributionCol") + >>> algo.getTopicDistributionCol() + 'topicDistributionCol' + """ + return self._set(topicDistributionCol=value) + @since("2.0.0") def isDistributed(self): """ @@ -1045,6 +1246,11 @@ class LDA(JavaEstimator, _LDAParams, JavaMLReadable, JavaMLWritable): >>> df = spark.createDataFrame([[1, Vectors.dense([0.0, 1.0])], ... [2, SparseVector(2, {0: 1.0})],], ["id", "features"]) >>> lda = LDA(k=2, seed=1, optimizer="em") + >>> lda.setMaxIter(10) + LDA... + >>> lda.getMaxIter() + 10 + >>> lda.clear(lda.maxIter) >>> model = lda.fit(df) >>> model.getTopicDistributionCol() 'topicDistribution' @@ -1125,6 +1331,20 @@ def setParams(self, featuresCol="features", maxIter=20, seed=None, checkpointInt kwargs = self._input_kwargs return self._set(**kwargs) + @since("2.0.0") + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. + """ + return self._set(checkpointInterval=value) + + @since("2.0.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + @since("2.0.0") def setK(self, value): """ @@ -1236,6 +1456,20 @@ def setKeepLastCheckpoint(self, value): """ return self._set(keepLastCheckpoint=value) + @since("2.0.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("2.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. 
+ """ + return self._set(featuresCol=value) + @inherit_doc class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol): @@ -1392,6 +1626,20 @@ def setDstCol(self, value): """ return self._set(dstCol=value) + @since("2.4.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("2.4.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + @since("2.4.0") def assignClusters(self, dataset): """ diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index cdd9be7bf11b3..b55f3e1ca459e 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -196,6 +196,25 @@ def getNumBins(self): """ return self.getOrDefault(self.numBins) + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + def setRawPredictionCol(self, value): + """ + Sets the value of :py:attr:`rawPredictionCol`. + """ + return self._set(rawPredictionCol=value) + + @since("3.0.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + @keyword_only @since("1.4.0") def setParams(self, rawPredictionCol="rawPrediction", labelCol="label", @@ -299,6 +318,25 @@ def getThroughOrigin(self): """ return self.getOrDefault(self.throughOrigin) + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("3.0.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + @keyword_only @since("1.4.0") def setParams(self, predictionCol="prediction", labelCol="label", @@ -453,6 +491,32 @@ def getEps(self): """ return self.getOrDefault(self.eps) + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("3.0.0") + def setProbabilityCol(self, value): + """ + Sets the value of :py:attr:`probabilityCol`. + """ + return self._set(probabilityCol=value) + + @since("3.0.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + @keyword_only @since("1.5.0") def setParams(self, predictionCol="prediction", labelCol="label", @@ -549,6 +613,20 @@ def getMetricLabel(self): """ return self.getOrDefault(self.metricLabel) + @since("3.0.0") + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. 
+ """ + return self._set(predictionCol=value) + @keyword_only @since("3.0.0") def setParams(self, predictionCol="prediction", labelCol="label", @@ -613,6 +691,18 @@ def __init__(self, predictionCol="prediction", featuresCol="features", kwargs = self._input_kwargs self._set(**kwargs) + @keyword_only + @since("2.3.0") + def setParams(self, predictionCol="prediction", featuresCol="features", + metricName="silhouette", distanceMeasure="squaredEuclidean"): + """ + setParams(self, predictionCol="prediction", featuresCol="features", \ + metricName="silhouette", distanceMeasure="squaredEuclidean") + Sets params for clustering evaluator. + """ + kwargs = self._input_kwargs + return self._set(**kwargs) + @since("2.3.0") def setMetricName(self, value): """ @@ -627,18 +717,6 @@ def getMetricName(self): """ return self.getOrDefault(self.metricName) - @keyword_only - @since("2.3.0") - def setParams(self, predictionCol="prediction", featuresCol="features", - metricName="silhouette", distanceMeasure="squaredEuclidean"): - """ - setParams(self, predictionCol="prediction", featuresCol="features", \ - metricName="silhouette", distanceMeasure="squaredEuclidean") - Sets params for clustering evaluator. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - @since("2.4.0") def setDistanceMeasure(self, value): """ @@ -653,6 +731,18 @@ def getDistanceMeasure(self): """ return self.getOrDefault(self.distanceMeasure) + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + @inherit_doc class RankingEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, @@ -734,6 +824,20 @@ def getK(self): """ return self.getOrDefault(self.k) + @since("3.0.0") + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + @keyword_only @since("3.0.0") def setParams(self, predictionCol="prediction", labelCol="label", diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index a0883f1d54fed..9c34d98518e65 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -154,6 +154,32 @@ def setThresholds(self, value): """ return self._set(thresholds=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + class _LSHParams(HasInputCol, HasOutputCol): """ @@ -183,12 +209,36 @@ def setNumHashTables(self, value): """ return self._set(numHashTables=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + class _LSHModel(JavaModel, _LSHParams): """ Mixin for Locality Sensitive Hashing (LSH) models. 
""" + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def approxNearestNeighbors(self, dataset, key, numNearestNeighbors, distCol="distCol"): """ Given a large dataset and an item, approximately find at most k items which have the @@ -350,6 +400,24 @@ def setBucketLength(self, value): """ return self._set(bucketLength=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + def _create_model(self, java_model): return BucketedRandomProjectionLSHModel(java_model) @@ -366,6 +434,20 @@ class BucketedRandomProjectionLSHModel(_LSHModel, _BucketedRandomProjectionLSHPa .. versionadded:: 2.2.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @inherit_doc class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, @@ -510,6 +592,38 @@ def getSplitsArray(self): """ return self.getOrDefault(self.splitsArray) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol): """ @@ -695,6 +809,18 @@ def setBinary(self, value): """ return self._set(binary=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def _create_model(self, java_model): return CountVectorizerModel(java_model) @@ -707,6 +833,34 @@ class CountVectorizerModel(JavaModel, _CountVectorizerParams, JavaMLReadable, Ja .. versionadded:: 1.6.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + @since("3.0.0") + def setMinTF(self, value): + """ + Sets the value of :py:attr:`minTF`. + """ + return self._set(minTF=value) + + @since("3.0.0") + def setBinary(self, value): + """ + Sets the value of :py:attr:`binary`. 
+ """ + return self._set(binary=value) + @classmethod @since("2.4.0") def from_vocabulary(cls, vocabulary, inputCol, outputCol=None, minTF=None, binary=None): @@ -978,6 +1132,24 @@ def getCategoricalCols(self): """ return self.getOrDefault(self.categoricalCols) + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + def setNumFeatures(self, value): + """ + Sets the value of :py:attr:`numFeatures`. + """ + return self._set(numFeatures=value) + @inherit_doc class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, @@ -1050,6 +1222,24 @@ def getBinary(self): """ return self.getOrDefault(self.binary) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + def setNumFeatures(self, value): + """ + Sets the value of :py:attr:`numFeatures`. + """ + return self._set(numFeatures=value) + @since("3.0.0") def indexOf(self, term): """ @@ -1145,6 +1335,18 @@ def setMinDocFreq(self, value): """ return self._set(minDocFreq=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def _create_model(self, java_model): return IDFModel(java_model) @@ -1156,6 +1358,20 @@ class IDFModel(JavaModel, _IDFParams, JavaMLReadable, JavaMLWritable): .. versionadded:: 1.4.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @property @since("2.0.0") def idf(self): @@ -1308,6 +1524,20 @@ def setMissingValue(self, value): """ return self._set(missingValue=value) + @since("2.2.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + @since("2.2.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + def _create_model(self, java_model): return ImputerModel(java_model) @@ -1319,6 +1549,20 @@ class ImputerModel(JavaModel, _ImputerParams, JavaMLReadable, JavaMLWritable): .. versionadded:: 2.2.0 """ + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + @property @since("2.2.0") def surrogateDF(self): @@ -1381,6 +1625,20 @@ def setParams(self, inputCols=None, outputCol=None): kwargs = self._input_kwargs return self._set(**kwargs) + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. 
+ """ + return self._set(outputCol=value) + class _MaxAbsScalerParams(HasInputCol, HasOutputCol): """ @@ -1449,6 +1707,18 @@ def setParams(self, inputCol=None, outputCol=None): kwargs = self._input_kwargs return self._set(**kwargs) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def _create_model(self, java_model): return MaxAbsScalerModel(java_model) @@ -1460,6 +1730,20 @@ class MaxAbsScalerModel(JavaModel, _MaxAbsScalerParams, JavaMLReadable, JavaMLWr .. versionadded:: 2.0.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @property @since("2.0.0") def maxAbs(self): @@ -1544,6 +1828,12 @@ def setParams(self, inputCol=None, outputCol=None, seed=None, numHashTables=1): kwargs = self._input_kwargs return self._set(**kwargs) + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + def _create_model(self, java_model): return MinHashLSHModel(java_model) @@ -1675,6 +1965,18 @@ def setMax(self, value): """ return self._set(max=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def _create_model(self, java_model): return MinMaxScalerModel(java_model) @@ -1686,6 +1988,34 @@ class MinMaxScalerModel(JavaModel, _MinMaxScalerParams, JavaMLReadable, JavaMLWr .. versionadded:: 1.6.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + @since("3.0.0") + def setMin(self, value): + """ + Sets the value of :py:attr:`min`. + """ + return self._set(min=value) + + @since("3.0.0") + def setMax(self, value): + """ + Sets the value of :py:attr:`max`. + """ + return self._set(max=value) + @property @since("2.0.0") def originalMin(self): @@ -1944,6 +2274,27 @@ def setDropLast(self, value): """ return self._set(dropLast=value) + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + @since("3.0.0") + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + def _create_model(self, java_model): return OneHotEncoderModel(java_model) @@ -1955,6 +2306,34 @@ class OneHotEncoderModel(JavaModel, _OneHotEncoderParams, JavaMLReadable, JavaML .. versionadded:: 2.3.0 """ + @since("3.0.0") + def setDropLast(self, value): + """ + Sets the value of :py:attr:`dropLast`. + """ + return self._set(dropLast=value) + + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. 
+ """ + return self._set(inputCols=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + @since("3.0.0") + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + @property @since("2.3.0") def categorySizes(self): @@ -2213,6 +2592,38 @@ def getRelativeError(self): """ return self.getOrDefault(self.relativeError) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + def _create_model(self, java_model): """ Private method to convert the java_model to a Python model. @@ -2373,6 +2784,20 @@ def setWithScaling(self, value): """ return self._set(withScaling=value) + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def _create_model(self, java_model): return RobustScalerModel(java_model) @@ -2384,6 +2809,20 @@ class RobustScalerModel(JavaModel, _RobustScalerParams, JavaMLReadable, JavaMLWr .. versionadded:: 3.0.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @property @since("3.0.0") def median(self): @@ -2694,6 +3133,18 @@ def setWithStd(self, value): """ return self._set(withStd=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def _create_model(self, java_model): return StandardScalerModel(java_model) @@ -2705,6 +3156,18 @@ class StandardScalerModel(JavaModel, _StandardScalerParams, JavaMLReadable, Java .. versionadded:: 1.4.0 """ + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @property @since("2.0.0") def std(self): @@ -2866,6 +3329,38 @@ def setStringOrderType(self, value): """ return self._set(stringOrderType=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. 
+ """ + return self._set(outputCol=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + class StringIndexerModel(JavaModel, _StringIndexerParams, JavaMLReadable, JavaMLWritable): """ @@ -2874,6 +3369,39 @@ class StringIndexerModel(JavaModel, _StringIndexerParams, JavaMLReadable, JavaML .. versionadded:: 1.4.0 """ + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + @since("3.0.0") + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + @since("2.4.0") + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + @classmethod @since("2.4.0") def from_labels(cls, labels, inputCol, outputCol=None, handleInvalid=None): @@ -2921,13 +3449,6 @@ def labels(self): """ return self._call_java("labels") - @since("2.4.0") - def setHandleInvalid(self, value): - """ - Sets the value of :py:attr:`handleInvalid`. - """ - return self._set(handleInvalid=value) - @inherit_doc class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): @@ -2981,6 +3502,18 @@ def getLabels(self): """ return self.getOrDefault(self.labels) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ @@ -3079,6 +3612,18 @@ def getLocale(self): """ return self.getOrDefault(self.locale) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @staticmethod @since("2.0.0") def loadDefaultStopWords(language): @@ -3220,6 +3765,24 @@ def setParams(self, inputCols=None, outputCol=None, handleInvalid="error"): kwargs = self._input_kwargs return self._set(**kwargs) + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCols`. + """ + return self._set(inputCols=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + class _VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): """ @@ -3359,6 +3922,24 @@ def setMaxCategories(self, value): """ return self._set(maxCategories=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. 
+ """ + return self._set(outputCol=value) + + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + def _create_model(self, java_model): return VectorIndexerModel(java_model) @@ -3380,6 +3961,20 @@ class VectorIndexerModel(JavaModel, _VectorIndexerParams, JavaMLReadable, JavaML .. versionadded:: 1.4.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @property @since("1.4.0") def numFeatures(self): @@ -3488,6 +4083,18 @@ def getNames(self): """ return self.getOrDefault(self.names) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): """ @@ -3560,6 +4167,11 @@ class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): >>> sent = ("a b " * 100 + "a c " * 10).split(" ") >>> doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"]) >>> word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model") + >>> word2Vec.setMaxIter(10) + Word2Vec... + >>> word2Vec.getMaxIter() + 10 + >>> word2Vec.clear(word2Vec.maxIter) >>> model = word2Vec.fit(doc) >>> model.getMinCount() 5 @@ -3666,12 +4278,36 @@ def setMaxSentenceLength(self, value): """ return self._set(maxSentenceLength=value) - @since("2.0.0") - def getMaxSentenceLength(self): + def setMaxIter(self, value): """ - Gets the value of maxSentenceLength or its default value. + Sets the value of :py:attr:`maxIter`. """ - return self.getOrDefault(self.maxSentenceLength) + return self._set(maxIter=value) + + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("1.4.0") + def setStepSize(self, value): + """ + Sets the value of :py:attr:`stepSize`. + """ + return self._set(stepSize=value) def _create_model(self, java_model): return Word2VecModel(java_model) @@ -3692,6 +4328,18 @@ def getVectors(self): """ return self._call_java("getVectors") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @since("1.5.0") def findSynonyms(self, word, num): """ @@ -3800,6 +4448,18 @@ def setK(self, value): """ return self._set(k=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + def _create_model(self, java_model): return PCAModel(java_model) @@ -3811,6 +4471,20 @@ class PCAModel(JavaModel, _PCAParams, JavaMLReadable, JavaMLWritable): .. 
versionadded:: 1.5.0 """ + @since("3.0.0") + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @property @since("2.0.0") def pc(self): @@ -4001,6 +4675,24 @@ def setStringIndexerOrderType(self, value): """ return self._set(stringIndexerOrderType=value) + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + def _create_model(self, java_model): return RFormulaModel(java_model) @@ -4228,6 +4920,24 @@ def setFwe(self, value): """ return self._set(fwe=value) + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + def _create_model(self, java_model): return ChiSqSelectorModel(java_model) @@ -4239,6 +4949,20 @@ class ChiSqSelectorModel(JavaModel, _ChiSqSelectorParams, JavaMLReadable, JavaML .. versionadded:: 2.0.0 """ + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @property @since("2.0.0") def selectedFeatures(self): @@ -4323,6 +5047,18 @@ def setSize(self, value): """ Sets size param, the size of vectors in `inputCol`.""" return self._set(size=value) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setHandleInvalid(self, value): + """ + Sets the value of :py:attr:`handleInvalid`. + """ + return self._set(handleInvalid=value) + if __name__ == "__main__": import doctest diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index 652acbb34a901..5b34d555484d1 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -102,6 +102,13 @@ def setMinConfidence(self, value): """ return self._set(minConfidence=value) + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + @property @since("2.2.0") def freqItemsets(self): @@ -239,6 +246,12 @@ def setMinConfidence(self, value): """ return self._set(minConfidence=value) + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + def _create_model(self, java_model): return FPGrowthModel(java_model) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index c99ec3f467ac6..8ea94e4760007 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -81,12 +81,6 @@ def _gen_param_code(name, doc, defaultValueStr): """ # TODO: How to correctly inherit instance attributes? 
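# Note: the template below now emits only the getter; the corresponding
# set$Name setters are written out explicitly on each estimator/model class.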
template = ''' - def set$Name(self, value): - """ - Sets the value of :py:attr:`$name`. - """ - return self._set($name=value) - def get$Name(self): """ Gets the value of $name or its default value. diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 771b4bcd9ba02..26d74fab6975a 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -30,12 +30,6 @@ class HasMaxIter(Params): def __init__(self): super(HasMaxIter, self).__init__() - def setMaxIter(self, value): - """ - Sets the value of :py:attr:`maxIter`. - """ - return self._set(maxIter=value) - def getMaxIter(self): """ Gets the value of maxIter or its default value. @@ -53,12 +47,6 @@ class HasRegParam(Params): def __init__(self): super(HasRegParam, self).__init__() - def setRegParam(self, value): - """ - Sets the value of :py:attr:`regParam`. - """ - return self._set(regParam=value) - def getRegParam(self): """ Gets the value of regParam or its default value. @@ -77,12 +65,6 @@ def __init__(self): super(HasFeaturesCol, self).__init__() self._setDefault(featuresCol='features') - def setFeaturesCol(self, value): - """ - Sets the value of :py:attr:`featuresCol`. - """ - return self._set(featuresCol=value) - def getFeaturesCol(self): """ Gets the value of featuresCol or its default value. @@ -101,12 +83,6 @@ def __init__(self): super(HasLabelCol, self).__init__() self._setDefault(labelCol='label') - def setLabelCol(self, value): - """ - Sets the value of :py:attr:`labelCol`. - """ - return self._set(labelCol=value) - def getLabelCol(self): """ Gets the value of labelCol or its default value. @@ -125,12 +101,6 @@ def __init__(self): super(HasPredictionCol, self).__init__() self._setDefault(predictionCol='prediction') - def setPredictionCol(self, value): - """ - Sets the value of :py:attr:`predictionCol`. - """ - return self._set(predictionCol=value) - def getPredictionCol(self): """ Gets the value of predictionCol or its default value. @@ -149,12 +119,6 @@ def __init__(self): super(HasProbabilityCol, self).__init__() self._setDefault(probabilityCol='probability') - def setProbabilityCol(self, value): - """ - Sets the value of :py:attr:`probabilityCol`. - """ - return self._set(probabilityCol=value) - def getProbabilityCol(self): """ Gets the value of probabilityCol or its default value. @@ -173,12 +137,6 @@ def __init__(self): super(HasRawPredictionCol, self).__init__() self._setDefault(rawPredictionCol='rawPrediction') - def setRawPredictionCol(self, value): - """ - Sets the value of :py:attr:`rawPredictionCol`. - """ - return self._set(rawPredictionCol=value) - def getRawPredictionCol(self): """ Gets the value of rawPredictionCol or its default value. @@ -196,12 +154,6 @@ class HasInputCol(Params): def __init__(self): super(HasInputCol, self).__init__() - def setInputCol(self, value): - """ - Sets the value of :py:attr:`inputCol`. - """ - return self._set(inputCol=value) - def getInputCol(self): """ Gets the value of inputCol or its default value. @@ -219,12 +171,6 @@ class HasInputCols(Params): def __init__(self): super(HasInputCols, self).__init__() - def setInputCols(self, value): - """ - Sets the value of :py:attr:`inputCols`. - """ - return self._set(inputCols=value) - def getInputCols(self): """ Gets the value of inputCols or its default value. @@ -243,12 +189,6 @@ def __init__(self): super(HasOutputCol, self).__init__() self._setDefault(outputCol=self.uid + '__output') - def setOutputCol(self, value): - """ - Sets the value of :py:attr:`outputCol`. 
- """ - return self._set(outputCol=value) - def getOutputCol(self): """ Gets the value of outputCol or its default value. @@ -266,12 +206,6 @@ class HasOutputCols(Params): def __init__(self): super(HasOutputCols, self).__init__() - def setOutputCols(self, value): - """ - Sets the value of :py:attr:`outputCols`. - """ - return self._set(outputCols=value) - def getOutputCols(self): """ Gets the value of outputCols or its default value. @@ -290,12 +224,6 @@ def __init__(self): super(HasNumFeatures, self).__init__() self._setDefault(numFeatures=262144) - def setNumFeatures(self, value): - """ - Sets the value of :py:attr:`numFeatures`. - """ - return self._set(numFeatures=value) - def getNumFeatures(self): """ Gets the value of numFeatures or its default value. @@ -313,12 +241,6 @@ class HasCheckpointInterval(Params): def __init__(self): super(HasCheckpointInterval, self).__init__() - def setCheckpointInterval(self, value): - """ - Sets the value of :py:attr:`checkpointInterval`. - """ - return self._set(checkpointInterval=value) - def getCheckpointInterval(self): """ Gets the value of checkpointInterval or its default value. @@ -337,12 +259,6 @@ def __init__(self): super(HasSeed, self).__init__() self._setDefault(seed=hash(type(self).__name__)) - def setSeed(self, value): - """ - Sets the value of :py:attr:`seed`. - """ - return self._set(seed=value) - def getSeed(self): """ Gets the value of seed or its default value. @@ -360,12 +276,6 @@ class HasTol(Params): def __init__(self): super(HasTol, self).__init__() - def setTol(self, value): - """ - Sets the value of :py:attr:`tol`. - """ - return self._set(tol=value) - def getTol(self): """ Gets the value of tol or its default value. @@ -383,12 +293,6 @@ class HasStepSize(Params): def __init__(self): super(HasStepSize, self).__init__() - def setStepSize(self, value): - """ - Sets the value of :py:attr:`stepSize`. - """ - return self._set(stepSize=value) - def getStepSize(self): """ Gets the value of stepSize or its default value. @@ -406,12 +310,6 @@ class HasHandleInvalid(Params): def __init__(self): super(HasHandleInvalid, self).__init__() - def setHandleInvalid(self, value): - """ - Sets the value of :py:attr:`handleInvalid`. - """ - return self._set(handleInvalid=value) - def getHandleInvalid(self): """ Gets the value of handleInvalid or its default value. @@ -430,12 +328,6 @@ def __init__(self): super(HasElasticNetParam, self).__init__() self._setDefault(elasticNetParam=0.0) - def setElasticNetParam(self, value): - """ - Sets the value of :py:attr:`elasticNetParam`. - """ - return self._set(elasticNetParam=value) - def getElasticNetParam(self): """ Gets the value of elasticNetParam or its default value. @@ -454,12 +346,6 @@ def __init__(self): super(HasFitIntercept, self).__init__() self._setDefault(fitIntercept=True) - def setFitIntercept(self, value): - """ - Sets the value of :py:attr:`fitIntercept`. - """ - return self._set(fitIntercept=value) - def getFitIntercept(self): """ Gets the value of fitIntercept or its default value. @@ -478,12 +364,6 @@ def __init__(self): super(HasStandardization, self).__init__() self._setDefault(standardization=True) - def setStandardization(self, value): - """ - Sets the value of :py:attr:`standardization`. - """ - return self._set(standardization=value) - def getStandardization(self): """ Gets the value of standardization or its default value. 
@@ -501,12 +381,6 @@ class HasThresholds(Params): def __init__(self): super(HasThresholds, self).__init__() - def setThresholds(self, value): - """ - Sets the value of :py:attr:`thresholds`. - """ - return self._set(thresholds=value) - def getThresholds(self): """ Gets the value of thresholds or its default value. @@ -525,12 +399,6 @@ def __init__(self): super(HasThreshold, self).__init__() self._setDefault(threshold=0.5) - def setThreshold(self, value): - """ - Sets the value of :py:attr:`threshold`. - """ - return self._set(threshold=value) - def getThreshold(self): """ Gets the value of threshold or its default value. @@ -548,12 +416,6 @@ class HasWeightCol(Params): def __init__(self): super(HasWeightCol, self).__init__() - def setWeightCol(self, value): - """ - Sets the value of :py:attr:`weightCol`. - """ - return self._set(weightCol=value) - def getWeightCol(self): """ Gets the value of weightCol or its default value. @@ -572,12 +434,6 @@ def __init__(self): super(HasSolver, self).__init__() self._setDefault(solver='auto') - def setSolver(self, value): - """ - Sets the value of :py:attr:`solver`. - """ - return self._set(solver=value) - def getSolver(self): """ Gets the value of solver or its default value. @@ -595,12 +451,6 @@ class HasVarianceCol(Params): def __init__(self): super(HasVarianceCol, self).__init__() - def setVarianceCol(self, value): - """ - Sets the value of :py:attr:`varianceCol`. - """ - return self._set(varianceCol=value) - def getVarianceCol(self): """ Gets the value of varianceCol or its default value. @@ -619,12 +469,6 @@ def __init__(self): super(HasAggregationDepth, self).__init__() self._setDefault(aggregationDepth=2) - def setAggregationDepth(self, value): - """ - Sets the value of :py:attr:`aggregationDepth`. - """ - return self._set(aggregationDepth=value) - def getAggregationDepth(self): """ Gets the value of aggregationDepth or its default value. @@ -643,12 +487,6 @@ def __init__(self): super(HasParallelism, self).__init__() self._setDefault(parallelism=1) - def setParallelism(self, value): - """ - Sets the value of :py:attr:`parallelism`. - """ - return self._set(parallelism=value) - def getParallelism(self): """ Gets the value of parallelism or its default value. @@ -667,12 +505,6 @@ def __init__(self): super(HasCollectSubModels, self).__init__() self._setDefault(collectSubModels=False) - def setCollectSubModels(self, value): - """ - Sets the value of :py:attr:`collectSubModels`. - """ - return self._set(collectSubModels=value) - def getCollectSubModels(self): """ Gets the value of collectSubModels or its default value. @@ -690,12 +522,6 @@ class HasLoss(Params): def __init__(self): super(HasLoss, self).__init__() - def setLoss(self, value): - """ - Sets the value of :py:attr:`loss`. - """ - return self._set(loss=value) - def getLoss(self): """ Gets the value of loss or its default value. @@ -714,12 +540,6 @@ def __init__(self): super(HasDistanceMeasure, self).__init__() self._setDefault(distanceMeasure='euclidean') - def setDistanceMeasure(self, value): - """ - Sets the value of :py:attr:`distanceMeasure`. - """ - return self._set(distanceMeasure=value) - def getDistanceMeasure(self): """ Gets the value of distanceMeasure or its default value. @@ -737,12 +557,6 @@ class HasValidationIndicatorCol(Params): def __init__(self): super(HasValidationIndicatorCol, self).__init__() - def setValidationIndicatorCol(self, value): - """ - Sets the value of :py:attr:`validationIndicatorCol`. 
- """ - return self._set(validationIndicatorCol=value) - def getValidationIndicatorCol(self): """ Gets the value of validationIndicatorCol or its default value. diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index df9c765457ec1..3ebd0ac2765f3 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -212,7 +212,16 @@ class ALS(JavaEstimator, _ALSParams, JavaMLWritable, JavaMLReadable): >>> df = spark.createDataFrame( ... [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)], ... ["user", "item", "rating"]) - >>> als = ALS(rank=10, maxIter=5, seed=0) + >>> als = ALS(rank=10, seed=0) + >>> als.setMaxIter(5) + ALS... + >>> als.getMaxIter() + 5 + >>> als.setRegParam(0.1) + ALS... + >>> als.getRegParam() + 0.1 + >>> als.clear(als.regParam) >>> model = als.fit(df) >>> model.getUserCol() 'user' @@ -402,6 +411,36 @@ def setColdStartStrategy(self, value): """ return self._set(coldStartStrategy=value) + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + def setRegParam(self, value): + """ + Sets the value of :py:attr:`regParam`. + """ + return self._set(regParam=value) + + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. + """ + return self._set(checkpointInterval=value) + + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + class ALSModel(JavaModel, _ALSModelParams, JavaMLWritable, JavaMLReadable): """ @@ -431,6 +470,13 @@ def setColdStartStrategy(self, value): """ return self._set(coldStartStrategy=value) + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + @property @since("1.4.0") def rank(self): diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 147ebed1d633a..ca967ea40f3cd 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -92,7 +92,17 @@ class LinearRegression(JavaPredictor, _LinearRegressionParams, JavaMLWritable, J >>> df = spark.createDataFrame([ ... (1.0, 2.0, Vectors.dense(1.0)), ... (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"]) - >>> lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight") + >>> lr = LinearRegression(regParam=0.0, solver="normal", weightCol="weight") + >>> lr.setMaxIter(5) + LinearRegression... + >>> lr.getMaxIter() + 5 + >>> lr.setRegParam(0.1) + LinearRegression... + >>> lr.getRegParam() + 0.1 + >>> lr.setRegParam(0.0) + LinearRegression... >>> model = lr.fit(df) >>> model.setFeaturesCol("features") LinearRegression... @@ -179,6 +189,66 @@ def setEpsilon(self, value): """ return self._set(epsilon=value) + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + def setRegParam(self, value): + """ + Sets the value of :py:attr:`regParam`. + """ + return self._set(regParam=value) + + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + + def setElasticNetParam(self, value): + """ + Sets the value of :py:attr:`elasticNetParam`. 
+ """ + return self._set(elasticNetParam=value) + + def setFitIntercept(self, value): + """ + Sets the value of :py:attr:`fitIntercept`. + """ + return self._set(fitIntercept=value) + + def setStandardization(self, value): + """ + Sets the value of :py:attr:`standardization`. + """ + return self._set(standardization=value) + + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + + def setSolver(self, value): + """ + Sets the value of :py:attr:`solver`. + """ + return self._set(solver=value) + + def setAggregationDepth(self, value): + """ + Sets the value of :py:attr:`aggregationDepth`. + """ + return self._set(aggregationDepth=value) + + def setLoss(self, value): + """ + Sets the value of :py:attr:`loss`. + """ + return self._set(lossType=value) + class LinearRegressionModel(JavaPredictionModel, _LinearRegressionParams, GeneralJavaMLWritable, JavaMLReadable, HasTrainingSummary): @@ -522,10 +592,6 @@ class IsotonicRegression(JavaEstimator, _IsotonicRegressionParams, HasWeightCol, >>> model = ir.fit(df) >>> model.setFeaturesCol("features") IsotonicRegression... - >>> model.setLabelCol("newLabel") - IsotonicRegression... - >>> model.getLabelCol() - 'newLabel' >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -586,6 +652,34 @@ def setFeatureIndex(self, value): """ return self._set(featureIndex=value) + @since("1.6.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("1.6.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("1.6.0") + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + @since("1.6.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + class IsotonicRegressionModel(JavaModel, _IsotonicRegressionParams, JavaMLWritable, JavaMLReadable): @@ -595,6 +689,26 @@ class IsotonicRegressionModel(JavaModel, _IsotonicRegressionParams, JavaMLWritab .. versionadded:: 1.6.0 """ + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + def setFeatureIndex(self, value): + """ + Sets the value of :py:attr:`featureIndex`. + """ + return self._set(featureIndex=value) + @property @since("1.6.0") def boundaries(self): @@ -732,18 +846,21 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return DecisionTreeRegressionModel(java_model) + @since("1.4.0") def setMaxDepth(self, value): """ Sets the value of :py:attr:`maxDepth`. """ return self._set(maxDepth=value) + @since("1.4.0") def setMaxBins(self, value): """ Sets the value of :py:attr:`maxBins`. """ return self._set(maxBins=value) + @since("1.4.0") def setMinInstancesPerNode(self, value): """ Sets the value of :py:attr:`minInstancesPerNode`. @@ -757,18 +874,21 @@ def setMinWeightFractionPerNode(self, value): """ return self._set(minWeightFractionPerNode=value) + @since("1.4.0") def setMinInfoGain(self, value): """ Sets the value of :py:attr:`minInfoGain`. 
""" return self._set(minInfoGain=value) + @since("1.4.0") def setMaxMemoryInMB(self, value): """ Sets the value of :py:attr:`maxMemoryInMB`. """ return self._set(maxMemoryInMB=value) + @since("1.4.0") def setCacheNodeIds(self, value): """ Sets the value of :py:attr:`cacheNodeIds`. @@ -782,6 +902,34 @@ def setImpurity(self, value): """ return self._set(impurity=value) + @since("1.4.0") + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. + """ + return self._set(checkpointInterval=value) + + @since("1.4.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("3.0.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + + @since("2.0.0") + def setVarianceCol(self, value): + """ + Sets the value of :py:attr:`varianceCol`. + """ + return self._set(varianceCol=value) + @inherit_doc class DecisionTreeRegressionModel(_DecisionTreeModel, _DecisionTreeRegressorParams, @@ -792,6 +940,13 @@ class DecisionTreeRegressionModel(_DecisionTreeModel, _DecisionTreeRegressorPara .. versionadded:: 1.4.0 """ + @since("3.0.0") + def setVarianceCol(self, value): + """ + Sets the value of :py:attr:`varianceCol`. + """ + return self._set(varianceCol=value) + @property @since("2.0.0") def featureImportances(self): @@ -987,6 +1142,18 @@ def setFeatureSubsetStrategy(self, value): """ return self._set(featureSubsetStrategy=value) + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. + """ + return self._set(checkpointInterval=value) + + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + class RandomForestRegressionModel(_TreeEnsembleModel, _RandomForestRegressorParams, JavaMLWritable, JavaMLReadable): @@ -1052,7 +1219,11 @@ class GBTRegressor(JavaPredictor, _GBTRegressorParams, JavaMLWritable, JavaMLRea >>> df = spark.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42, leafCol="leafId") + >>> gbt = GBTRegressor(maxDepth=2, seed=42, leafCol="leafId") + >>> gbt.setMaxIter(5) + GBTRegressor... + >>> gbt.getMaxIter() + 5 >>> print(gbt.getImpurity()) variance >>> print(gbt.getFeatureSubsetStrategy()) @@ -1152,36 +1323,42 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return GBTRegressionModel(java_model) + @since("1.4.0") def setMaxDepth(self, value): """ Sets the value of :py:attr:`maxDepth`. """ return self._set(maxDepth=value) + @since("1.4.0") def setMaxBins(self, value): """ Sets the value of :py:attr:`maxBins`. """ return self._set(maxBins=value) + @since("1.4.0") def setMinInstancesPerNode(self, value): """ Sets the value of :py:attr:`minInstancesPerNode`. """ return self._set(minInstancesPerNode=value) + @since("1.4.0") def setMinInfoGain(self, value): """ Sets the value of :py:attr:`minInfoGain`. """ return self._set(minInfoGain=value) + @since("1.4.0") def setMaxMemoryInMB(self, value): """ Sets the value of :py:attr:`maxMemoryInMB`. """ return self._set(maxMemoryInMB=value) + @since("1.4.0") def setCacheNodeIds(self, value): """ Sets the value of :py:attr:`cacheNodeIds`. 
@@ -1223,6 +1400,34 @@ def setValidationIndicatorCol(self, value): """ return self._set(validationIndicatorCol=value) + @since("1.4.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("1.4.0") + def setCheckpointInterval(self, value): + """ + Sets the value of :py:attr:`checkpointInterval`. + """ + return self._set(checkpointInterval=value) + + @since("1.4.0") + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + @since("1.4.0") + def setStepSize(self, value): + """ + Sets the value of :py:attr:`stepSize`. + """ + return self._set(stepSize=value) + class GBTRegressionModel(_TreeEnsembleModel, _GBTRegressorParams, JavaMLWritable, JavaMLReadable): """ @@ -1330,6 +1535,11 @@ class AFTSurvivalRegression(JavaEstimator, _AFTSurvivalRegressionParams, ... (1.0, Vectors.dense(1.0), 1.0), ... (1e-40, Vectors.sparse(1, [], []), 0.0)], ["label", "features", "censor"]) >>> aftsr = AFTSurvivalRegression() + >>> aftsr.setMaxIter(10) + AFTSurvivalRegression... + >>> aftsr.getMaxIter() + 10 + >>> aftsr.clear(aftsr.maxIter) >>> model = aftsr.fit(df) >>> model.setFeaturesCol("features") AFTSurvivalRegression... @@ -1422,6 +1632,55 @@ def setQuantilesCol(self, value): """ return self._set(quantilesCol=value) + @since("1.6.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("1.6.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("1.6.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("1.6.0") + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + return self._set(labelCol=value) + + @since("1.6.0") + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + + @since("1.6.0") + def setFitIntercept(self, value): + """ + Sets the value of :py:attr:`fitIntercept`. + """ + return self._set(fitIntercept=value) + + @since("2.1.0") + def setAggregationDepth(self, value): + """ + Sets the value of :py:attr:`aggregationDepth`. + """ + return self._set(aggregationDepth=value) + class AFTSurvivalRegressionModel(JavaModel, _AFTSurvivalRegressionParams, JavaMLWritable, JavaMLReadable): @@ -1431,6 +1690,34 @@ class AFTSurvivalRegressionModel(JavaModel, _AFTSurvivalRegressionParams, .. versionadded:: 1.6.0 """ + @since("3.0.0") + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + return self._set(featuresCol=value) + + @since("3.0.0") + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. + """ + return self._set(predictionCol=value) + + @since("3.0.0") + def setQuantileProbabilities(self, value): + """ + Sets the value of :py:attr:`quantileProbabilities`. + """ + return self._set(quantileProbabilities=value) + + @since("3.0.0") + def setQuantilesCol(self, value): + """ + Sets the value of :py:attr:`quantilesCol`. + """ + return self._set(quantilesCol=value) + @property @since("2.0.0") def coefficients(self): @@ -1577,6 +1864,16 @@ class GeneralizedLinearRegression(JavaPredictor, _GeneralizedLinearRegressionPar ... (2.0, Vectors.dense(0.0, 0.0)), ... 
(2.0, Vectors.dense(1.0, 1.0)),], ["label", "features"]) >>> glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p") + >>> glr.setRegParam(0.1) + GeneralizedLinearRegression... + >>> glr.getRegParam() + 0.1 + >>> glr.clear(glr.regParam) + >>> glr.setMaxIter(10) + GeneralizedLinearRegression... + >>> glr.getMaxIter() + 10 + >>> glr.clear(glr.maxIter) >>> model = glr.fit(df) >>> model.setFeaturesCol("features") GeneralizedLinearRegression... @@ -1690,6 +1987,48 @@ def setOffsetCol(self, value): """ return self._set(offsetCol=value) + @since("2.0.0") + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + return self._set(maxIter=value) + + @since("2.0.0") + def setRegParam(self, value): + """ + Sets the value of :py:attr:`regParam`. + """ + return self._set(regParam=value) + + @since("2.0.0") + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + return self._set(tol=value) + + @since("2.2.0") + def setFitIntercept(self, value): + """ + Sets the value of :py:attr:`fitIntercept`. + """ + return self._set(fitIntercept=value) + + @since("2.0.0") + def setWeightCol(self, value): + """ + Sets the value of :py:attr:`weightCol`. + """ + return self._set(weightCol=value) + + @since("2.0.0") + def setSolver(self, value): + """ + Sets the value of :py:attr:`solver`. + """ + return self._set(solver=value) + class GeneralizedLinearRegressionModel(JavaPredictionModel, _GeneralizedLinearRegressionParams, JavaMLWritable, JavaMLReadable, HasTrainingSummary): @@ -1699,6 +2038,13 @@ class GeneralizedLinearRegressionModel(JavaPredictionModel, _GeneralizedLinearRe .. versionadded:: 2.0.0 """ + @since("3.0.0") + def setLinkPredictionCol(self, value): + """ + Sets the value of :py:attr:`linkPredictionCol`. + """ + return self._set(linkPredictionCol=value) + @property @since("2.0.0") def coefficients(self): diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 4c7f01484dc21..75cd903b5d6d7 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -221,13 +221,6 @@ def test_params(self): self.assertFalse(testParams.isSet(maxIter)) self.assertTrue(testParams.isDefined(maxIter)) self.assertEqual(testParams.getMaxIter(), 10) - testParams.setMaxIter(100) - self.assertTrue(testParams.isSet(maxIter)) - self.assertEqual(testParams.getMaxIter(), 100) - testParams.clear(maxIter) - self.assertFalse(testParams.isSet(maxIter)) - self.assertEqual(testParams.getMaxIter(), 10) - testParams.setMaxIter(100) self.assertTrue(testParams.hasParam(inputCol.name)) self.assertFalse(testParams.hasDefault(inputCol)) @@ -244,13 +237,12 @@ def test_params(self): # Since the default is normally random, set it to a known number for debug str testParams._setDefault(seed=41) - testParams.setSeed(43) self.assertEqual( testParams.explainParams(), "\n".join(["inputCol: input column name. (undefined)", - "maxIter: max number of iterations (>= 0). (default: 10, current: 100)", - "seed: random seed. (default: 41, current: 43)"])) + "maxIter: max number of iterations (>= 0). (default: 10)", + "seed: random seed. 
(default: 41)"])) def test_clear_param(self): df = self.spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"]) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 8fa0183e4683d..652ff7f691119 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -291,6 +291,24 @@ def setNumFolds(self, value): """ return self._set(numFolds=value) + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + def setParallelism(self, value): + """ + Sets the value of :py:attr:`parallelism`. + """ + return self._set(parallelism=value) + + def setCollectSubModels(self, value): + """ + Sets the value of :py:attr:`collectSubModels`. + """ + return self._set(collectSubModels=value) + def _fit(self, dataset): est = self.getOrDefault(self.estimator) epm = self.getOrDefault(self.estimatorParamMaps) @@ -624,6 +642,24 @@ def setTrainRatio(self, value): """ return self._set(trainRatio=value) + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + return self._set(seed=value) + + def setParallelism(self, value): + """ + Sets the value of :py:attr:`parallelism`. + """ + return self._set(parallelism=value) + + def setCollectSubModels(self, value): + """ + Sets the value of :py:attr:`collectSubModels`. + """ + return self._set(collectSubModels=value) + def _fit(self, dataset): est = self.getOrDefault(self.estimator) epm = self.getOrDefault(self.estimatorParamMaps) From d076f9e4515ea4c52e760ceb8a41c645dd2cc65d Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 23 Oct 2019 16:55:31 -0700 Subject: [PATCH 2/4] add more doc tests --- .../apache/spark/ml/feature/MinHashLSH.scala | 14 ++ python/pyspark/ml/evaluation.py | 24 ++- python/pyspark/ml/feature.py | 193 ++++++++++++++---- python/pyspark/ml/regression.py | 8 +- python/pyspark/ml/wrapper.py | 17 +- 5 files changed, 205 insertions(+), 51 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala index a56aae65dd151..da0eaad667ccb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala @@ -48,6 +48,14 @@ class MinHashLSHModel private[ml]( private[ml] val randCoefficients: Array[(Int, Int)]) extends LSHModel[MinHashLSHModel] { + /** @group setParam */ + @Since("2.4.0") + override def setInputCol(value: String): this.type = super.set(inputCol, value) + + /** @group setParam */ + @Since("2.4.0") + override def setOutputCol(value: String): this.type = super.set(outputCol, value) + @Since("2.1.0") override protected[ml] def hashFunction(elems: Vector): Array[Vector] = { require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.") @@ -105,6 +113,12 @@ class MinHashLSHModel private[ml]( @Since("2.1.0") class MinHashLSH(override val uid: String) extends LSH[MinHashLSHModel] with HasSeed { + @Since("2.1.0") + override def setInputCol(value: String): this.type = super.setInputCol(value) + + @Since("2.1.0") + override def setOutputCol(value: String): this.type = super.setOutputCol(value) + @Since("2.1.0") override def setNumHashTables(value: Int): this.type = super.setNumHashTables(value) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index b55f3e1ca459e..6539e2abaed12 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -119,7 +119,9 @@ class 
BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction ... [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)]) >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"]) ... - >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw") + >>> evaluator = BinaryClassificationEvaluator() + >>> evaluator.setRawPredictionCol("raw") + BinaryClassificationEvaluator... >>> evaluator.evaluate(dataset) 0.70... >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"}) @@ -239,7 +241,9 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh ... (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)] >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"]) ... - >>> evaluator = RegressionEvaluator(predictionCol="raw") + >>> evaluator = RegressionEvaluator() + >>> evaluator.setPredictionCol("raw") + RegressionEvaluator... >>> evaluator.evaluate(dataset) 2.842... >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"}) @@ -360,7 +364,9 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)] >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"]) - >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") + >>> evaluator = MulticlassClassificationEvaluator() + >>> evaluator.setPredictionCol("prediction") + MulticlassClassificationEvaluator... >>> evaluator.evaluate(dataset) 0.66... >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"}) @@ -546,7 +552,9 @@ class MultilabelClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio ... ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])] >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"]) ... - >>> evaluator = MultilabelClassificationEvaluator(predictionCol="prediction") + >>> evaluator = MultilabelClassificationEvaluator() + >>> evaluator.setPredictionCol("prediction") + MultilabelClassificationEvaluator... >>> evaluator.evaluate(dataset) 0.63... >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"}) @@ -659,7 +667,9 @@ class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol, ... ([10.5, 11.5], 1.0), ([1.0, 1.0], 0.0), ([8.0, 6.0], 1.0)]) >>> dataset = spark.createDataFrame(featureAndPredictions, ["features", "prediction"]) ... - >>> evaluator = ClusteringEvaluator(predictionCol="prediction") + >>> evaluator = ClusteringEvaluator() + >>> evaluator.setPredictionCol("prediction") + ClusteringEvaluator... >>> evaluator.evaluate(dataset) 0.9079... >>> ce_path = temp_path + "/ce" @@ -759,7 +769,9 @@ class RankingEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, ... ([1.0, 2.0, 3.0, 4.0, 5.0], [])] >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"]) ... - >>> evaluator = RankingEvaluator(predictionCol="prediction") + >>> evaluator = RankingEvaluator() + >>> evaluator.setPredictionCol("prediction") + RankingEvaluator... >>> evaluator.evaluate(dataset) 0.35... 
>>> evaluator.evaluate(dataset, {evaluator.metricName: "precisionAtK", evaluator.k: 2}) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 9c34d98518e65..a73d30230e109 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -24,7 +24,8 @@ from pyspark.ml.linalg import _convert_to_vector from pyspark.ml.param.shared import * from pyspark.ml.util import JavaMLReadable, JavaMLWritable -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, _jvm +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, \ + JavaUnaryTransformer, _jvm from pyspark.ml.common import inherit_doc __all__ = ['Binarizer', @@ -76,6 +77,12 @@ class Binarizer(JavaTransformer, HasThreshold, HasThresholds, HasInputCol, HasOu >>> df = spark.createDataFrame([(0.5,)], ["values"]) >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features") + >>> binarizer.setThreshold(1.0) + Binarizer... + >>> binarizer.setInputCol("values") + Binarizer... + >>> binarizer.setOutputCol("features") + Binarizer... >>> binarizer.transform(df).head().features 0.0 >>> binarizer.setParams(outputCol="freqs").transform(df).head().freqs @@ -319,8 +326,15 @@ class BucketedRandomProjectionLSH(_LSH, _BucketedRandomProjectionLSHParams, ... (2, Vectors.dense([1.0, -1.0 ]),), ... (3, Vectors.dense([1.0, 1.0]),)] >>> df = spark.createDataFrame(data, ["id", "features"]) - >>> brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", - ... seed=12345, bucketLength=1.0) + >>> brp = BucketedRandomProjectionLSH() + >>> brp.setInputCol("features") + BucketedRandomProjectionLSH... + >>> brp.setOutputCol("hashes") + BucketedRandomProjectionLSH... + >>> brp.setSeed(12345) + BucketedRandomProjectionLSH... + >>> brp.setBucketLength(1.0) + BucketedRandomProjectionLSH... >>> model = brp.fit(df) >>> model.getBucketLength() 1.0 @@ -462,8 +476,13 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")), ... (float("nan"), 1.0), (float("nan"), 0.0)] >>> df = spark.createDataFrame(values, ["values1", "values2"]) - >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")], - ... inputCol="values1", outputCol="buckets") + >>> bucketizer = Bucketizer() + >>> bucketizer.setSplits([-float("inf"), 0.5, 1.4, float("inf")]) + Bucketizer... + >>> bucketizer.setInputCol("values1") + Bucketizer... + >>> bucketizer.setOutputCol("buckets") + Bucketizer... >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect() >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1")) >>> bucketed.show(truncate=False) @@ -709,7 +728,11 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav >>> df = spark.createDataFrame( ... [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])], ... ["label", "raw"]) - >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors") + >>> cv = CountVectorizer() + >>> cv.setInputCol("raw") + CountVectorizer... + >>> cv.setOutputCol("vectors") + CountVectorizer... 
>>> model = cv.fit(df) >>> model.transform(df).show(truncate=False) +-----+---------------+-------------------------+ @@ -907,7 +930,7 @@ def setBinary(self, value): @inherit_doc -class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class DCT(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): """ A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero padding is performed on the input vector. @@ -920,7 +943,13 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit >>> from pyspark.ml.linalg import Vectors >>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"]) - >>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec") + >>> dct = DCT( ) + >>> dct.setInverse(False) + DCT... + >>> dct.setInputCol("vec") + DCT... + >>> dct.setOutputCol("resultVec") + DCT... >>> df2 = dct.transform(df1) >>> df2.head().resultVec DenseVector([10.969..., -0.707..., -2.041...]) @@ -976,8 +1005,7 @@ def getInverse(self): @inherit_doc -class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, - JavaMLWritable): +class ElementwiseProduct(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): """ Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a provided "weight" vector. In other words, it scales each column of the dataset @@ -985,8 +1013,13 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"]) - >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]), - ... inputCol="values", outputCol="eprod") + >>> ep = ElementwiseProduct() + >>> ep.setScalingVec(Vectors.dense([1.0, 2.0, 3.0])) + ElementwiseProduct... + >>> ep.setInputCol("values") + ElementwiseProduct... + >>> ep.setOutputCol("eprod") + ElementwiseProduct... >>> ep.transform(df).head().eprod DenseVector([2.0, 2.0, 9.0]) >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod @@ -1077,7 +1110,11 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")] >>> cols = ["real", "bool", "stringNum", "string"] >>> df = spark.createDataFrame(data, cols) - >>> hasher = FeatureHasher(inputCols=cols, outputCol="features") + >>> hasher = FeatureHasher() + >>> hasher.setInputCols(cols) + FeatureHasher... + >>> hasher.setOutputCol("features") + FeatureHasher... >>> hasher.transform(df).head().features SparseVector(262144, {174475: 2.0, 247670: 1.0, 257907: 1.0, 262126: 1.0}) >>> hasher.setCategoricalCols(["real"]).transform(df).head().features @@ -1163,7 +1200,9 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java otherwise the features will not be mapped evenly to the columns. >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["words"]) - >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features") + >>> hashingTF = HashingTF(inputCol="words", outputCol="features") + >>> hashingTF.setNumFeatures(10) + HashingTF... 
>>> hashingTF.transform(df).head().features SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0}) >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs @@ -1276,7 +1315,11 @@ class IDF(JavaEstimator, _IDFParams, JavaMLReadable, JavaMLWritable): >>> from pyspark.ml.linalg import DenseVector >>> df = spark.createDataFrame([(DenseVector([1.0, 2.0]),), ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"]) - >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf") + >>> idf = IDF(minDocFreq=3) + >>> idf.setInputCol("tf") + IDF... + >>> idf.setOutputCol("idf") + IDF... >>> model = idf.fit(df) >>> model.getMinDocFreq() 3 @@ -1444,7 +1487,11 @@ class Imputer(JavaEstimator, _ImputerParams, JavaMLReadable, JavaMLWritable): >>> df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")), (float("nan"), 3.0), ... (4.0, 4.0), (5.0, 5.0)], ["a", "b"]) - >>> imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"]) + >>> imputer = Imputer() + >>> imputer.setInputCols(["a", "b"]) + Imputer... + >>> imputer.setOutputCols(["out_a", "out_b"]) + Imputer... >>> model = imputer.fit(df) >>> model.getStrategy() 'mean' @@ -1586,7 +1633,11 @@ class Interaction(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, J with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`. >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"]) - >>> interaction = Interaction(inputCols=["a", "b"], outputCol="ab") + >>> interaction = Interaction() + >>> interaction.setInputCols(["a", "b"]) + Interaction... + >>> interaction.setOutputCol("ab") + Interaction... >>> interaction.transform(df).show() +---+---+-----+ | a| b| ab| @@ -1658,7 +1709,9 @@ class MaxAbsScaler(JavaEstimator, _MaxAbsScalerParams, JavaMLReadable, JavaMLWri >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"]) - >>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled") + >>> maScaler = MaxAbsScaler(outputCol="scaled") + >>> maScaler.setInputCol("a") + MaxAbsScaler... >>> model = maScaler.fit(df) >>> model.setOutputCol("scaledOutput") MaxAbsScaler... @@ -1771,7 +1824,13 @@ class MinHashLSH(_LSH, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable, JavaM ... (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),), ... (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)] >>> df = spark.createDataFrame(data, ["id", "features"]) - >>> mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345) + >>> mh = MinHashLSH() + >>> mh.setInputCol("features") + MinHashLSH... + >>> mh.setOutputCol("hashes") + MinHashLSH... + >>> mh.setSeed(12345) + MinHashLSH... >>> model = mh.fit(df) >>> model.transform(df).head() Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668... @@ -1896,7 +1955,9 @@ class MinMaxScaler(JavaEstimator, _MinMaxScalerParams, JavaMLReadable, JavaMLWri >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) - >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled") + >>> mmScaler = MinMaxScaler(outputCol="scaled") + >>> mmScaler.setInputCol("a") + MinMaxScaler... >>> model = mmScaler.fit(df) >>> model.setOutputCol("scaledOutput") MinMaxScaler... 
@@ -2035,7 +2096,7 @@ def originalMax(self): @inherit_doc @ignore_unicode_prefix -class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class NGram(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): """ A feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. @@ -2046,7 +2107,11 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr returned. >>> df = spark.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])]) - >>> ngram = NGram(n=2, inputCol="inputTokens", outputCol="nGrams") + >>> ngram = NGram(n=2) + >>> ngram.setInputCol("inputTokens") + NGram... + >>> ngram.setOutputCol("nGrams") + NGram... >>> ngram.transform(df).head() Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e']) >>> # Change n-gram length @@ -2111,14 +2176,18 @@ def getN(self): @inherit_doc -class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class Normalizer(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): """ Normalize a vector to have unit norm using the given p-norm. >>> from pyspark.ml.linalg import Vectors >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0}) >>> df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"]) - >>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features") + >>> normalizer = Normalizer(p=2.0) + >>> normalizer.setInputCol("dense") + Normalizer... + >>> normalizer.setOutputCol("features") + Normalizer... >>> normalizer.transform(df).head().features DenseVector([0.6, -0.8]) >>> normalizer.setParams(inputCol="sparse", outputCol="freqs").transform(df).head().freqs @@ -2225,7 +2294,11 @@ class OneHotEncoder(JavaEstimator, _OneHotEncoderParams, JavaMLReadable, JavaMLW >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"]) - >>> ohe = OneHotEncoder(inputCols=["input"], outputCols=["output"]) + >>> ohe = OneHotEncoder() + >>> ohe.setInputCols(["input"]) + OneHotEncoder... + >>> ohe.setOutputCols(["output"]) + OneHotEncoder... >>> model = ohe.fit(df) >>> model.getHandleInvalid() 'error' @@ -2345,8 +2418,7 @@ def categorySizes(self): @inherit_doc -class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, - JavaMLWritable): +class PolynomialExpansion(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): """ Perform feature expansion in a polynomial space. As said in `wikipedia of Polynomial Expansion `_, "In mathematics, an @@ -2356,7 +2428,11 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"]) - >>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded") + >>> px = PolynomialExpansion(degree=2) + >>> px.setInputCol("dense") + PolynomialExpansion... + >>> px.setOutputCol("expanded") + PolynomialExpansion... >>> px.transform(df).head().expanded DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]) >>> px.setParams(outputCol="test").transform(df).head().test @@ -2439,8 +2515,13 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasInputCols >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)] >>> df1 = spark.createDataFrame(values, ["values"]) - >>> qds1 = QuantileDiscretizer(numBuckets=2, - ... 
inputCol="values", outputCol="buckets", relativeError=0.01, handleInvalid="error") + >>> qds1 = QuantileDiscretizer(inputCol="values", outputCol="buckets") + >>> qds1.setNumBuckets(2) + QuantileDiscretizer... + >>> qds1.setRelativeError(0.01) + QuantileDiscretizer... + >>> qds1.setHandleInvalid("error") + QuantileDiscretizer... >>> qds1.getRelativeError() 0.01 >>> bucketizer = qds1.fit(df1) @@ -2703,7 +2784,11 @@ class RobustScaler(JavaEstimator, _RobustScalerParams, JavaMLReadable, JavaMLWri ... (3, Vectors.dense([3.0, -3.0]),), ... (4, Vectors.dense([4.0, -4.0]),),] >>> df = spark.createDataFrame(data, ["id", "features"]) - >>> scaler = RobustScaler(inputCol="features", outputCol="scaled") + >>> scaler = RobustScaler() + >>> scaler.setInputCol("features") + RobustScaler... + >>> scaler.setOutputCol("scaled") + RobustScaler... >>> model = scaler.fit(df) >>> model.setOutputCol("output") RobustScaler... @@ -2842,7 +2927,7 @@ def range(self): @inherit_doc @ignore_unicode_prefix -class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class RegexTokenizer(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): """ A regex based tokenizer that extracts tokens either by using the provided regex pattern (in Java dialect) to split the text @@ -2852,7 +2937,11 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, It returns an array of strings that can be empty. >>> df = spark.createDataFrame([("A B c",)], ["text"]) - >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words") + >>> reTokenizer = RegexTokenizer() + >>> reTokenizer.setInputCol("text") + RegexTokenizer... + >>> reTokenizer.setOutputCol("words") + RegexTokenizer... >>> reTokenizer.transform(df).head() Row(text=u'A B c', words=[u'a', u'b', u'c']) >>> # Change a parameter. @@ -3068,7 +3157,11 @@ class StandardScaler(JavaEstimator, _StandardScalerParams, JavaMLReadable, JavaM >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) - >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled") + >>> standardScaler = StandardScaler() + >>> standardScaler.setInputCol("a") + StandardScaler... + >>> standardScaler.setOutputCol("scaled") + StandardScaler... >>> model = standardScaler.fit(df) >>> model.getInputCol() 'a' @@ -3228,8 +3321,10 @@ class StringIndexer(JavaEstimator, _StringIndexerParams, JavaMLReadable, JavaMLW so the most frequent label gets index 0. The ordering behavior is controlled by setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'. - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", + >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", ... stringOrderType="frequencyDesc") + >>> stringIndexer.setHandleInvalid("error") + StringIndexer... >>> model = stringIndexer.fit(stringIndDf) >>> td = model.transform(stringIndDf) >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), @@ -3522,7 +3617,11 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl .. note:: null values from input array are preserved unless adding null to stopWords explicitly. >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"]) - >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"]) + >>> remover = StopWordsRemover(stopWords=["b"]) + >>> remover.setInputCol("text") + StopWordsRemover... 
+ >>> remover.setOutputCol("words") + StopWordsRemover... >>> remover.transform(df).head().words == ['a', 'c'] True >>> stopWordsRemoverPath = temp_path + "/stopwords-remover" @@ -3638,13 +3737,15 @@ def loadDefaultStopWords(language): @inherit_doc @ignore_unicode_prefix -class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class Tokenizer(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): """ A tokenizer that converts the input string to lowercase and then splits it by white spaces. >>> df = spark.createDataFrame([("a b c",)], ["text"]) - >>> tokenizer = Tokenizer(inputCol="text", outputCol="words") + >>> tokenizer = Tokenizer(outputCol="words") + >>> tokenizer.setInputCol("text") + Tokenizer... >>> tokenizer.transform(df).head() Row(text=u'a b c', words=[u'a', u'b', u'c']) >>> # Change a parameter. @@ -3697,7 +3798,9 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol, HasHandleInva A feature transformer that merges multiple columns into a vector column. >>> df = spark.createDataFrame([(1, 0, 3)], ["a", "b", "c"]) - >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features") + >>> vecAssembler = VectorAssembler(outputCol="features") + >>> vecAssembler.setInputCols(["a", "b", "c"]) + VectorAssembler... >>> vecAssembler.transform(df).head().features DenseVector([1.0, 0.0, 3.0]) >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs @@ -3851,7 +3954,9 @@ class VectorIndexer(JavaEstimator, _VectorIndexerParams, JavaMLReadable, JavaMLW >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),), ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"]) - >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed") + >>> indexer = VectorIndexer(maxCategories=2, inputCol="a") + >>> indexer.setOutputCol("indexed") + VectorIndexer... >>> model = indexer.fit(df) >>> indexer.getHandleInvalid() 'error' @@ -4012,7 +4117,9 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),), ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),), ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"]) - >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4]) + >>> vs = VectorSlicer(outputCol="sliced", indices=[1, 4]) + >>> vs.setInputCol("features") + VectorSlicer... >>> vs.transform(df).head().sliced DenseVector([2.3, 1.0]) >>> vectorSlicerPath = temp_path + "/vector-slicer" @@ -4395,7 +4502,9 @@ class PCA(JavaEstimator, _PCAParams, JavaMLReadable, JavaMLWritable): ... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), ... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] >>> df = spark.createDataFrame(data,["features"]) - >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features") + >>> pca = PCA(k=2, inputCol="features") + >>> pca.setOutputCol("pca_features") + PCA... >>> model = pca.fit(df) >>> model.getK() 2 diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index ca967ea40f3cd..08e68d8bc3044 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -749,7 +749,9 @@ class DecisionTreeRegressor(JavaPredictor, _DecisionTreeRegressorParams, JavaMLW >>> df = spark.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... 
(0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance") + >>> dt = DecisionTreeRegressor(maxDepth=2) + >>> dt.setVarianceCol("variance") + DecisionTreeRegressor... >>> model = dt.fit(df) >>> model.getVarianceCol() 'variance' @@ -991,7 +993,9 @@ class RandomForestRegressor(JavaPredictor, _RandomForestRegressorParams, JavaMLW >>> df = spark.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) + >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2) + >>> rf.setSeed(42) + RandomForestRegressor... >>> model = rf.fit(df) >>> model.getSeed() 42 diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 47e4921541ea2..d8e6c0047dda9 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -23,7 +23,7 @@ from pyspark import since from pyspark import SparkContext from pyspark.sql import DataFrame -from pyspark.ml import Estimator, Transformer, Model +from pyspark.ml import Estimator, Transformer, Model, UnaryTransformer from pyspark.ml.param import Params from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol from pyspark.ml.util import _jvm @@ -338,6 +338,21 @@ def _transform(self, dataset): return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) +@inherit_doc +class JavaUnaryTransformer(JavaParams, UnaryTransformer): + """ + Base class for :py:class:`UnaryTransformer`s that wrap Java/Scala + implementations. Subclasses should ensure they have the transformer Java object + available as _java_obj. + """ + + __metaclass__ = ABCMeta + + def _transform(self, dataset): + self._transfer_params_to_java() + return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) + + @inherit_doc class JavaModel(JavaTransformer, Model): """ From bff5a70e2bdcd23fccdb03647fd70f0197440cac Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 23 Oct 2019 21:48:58 -0700 Subject: [PATCH 3/4] not using UnaryTransformer for now --- python/pyspark/ml/feature.py | 103 ++++++++++++++++++++++++++++++++--- python/pyspark/ml/wrapper.py | 19 +------ 2 files changed, 95 insertions(+), 27 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index a73d30230e109..11bb7941b5d9a 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -24,8 +24,7 @@ from pyspark.ml.linalg import _convert_to_vector from pyspark.ml.param.shared import * from pyspark.ml.util import JavaMLReadable, JavaMLWritable -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, \ - JavaUnaryTransformer, _jvm +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, _jvm from pyspark.ml.common import inherit_doc __all__ = ['Binarizer', @@ -930,7 +929,7 @@ def setBinary(self, value): @inherit_doc -class DCT(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): +class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero padding is performed on the input vector. @@ -1003,9 +1002,22 @@ def getInverse(self): """ return self.getOrDefault(self.inverse) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. 
+ """ + return self._set(outputCol=value) + @inherit_doc -class ElementwiseProduct(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): +class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, + JavaMLWritable): """ Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a provided "weight" vector. In other words, it scales each column of the dataset @@ -1071,6 +1083,18 @@ def getScalingVec(self): """ return self.getOrDefault(self.scalingVec) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @inherit_doc class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable, @@ -2096,7 +2120,7 @@ def originalMax(self): @inherit_doc @ignore_unicode_prefix -class NGram(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): +class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. @@ -2174,9 +2198,21 @@ def getN(self): """ return self.getOrDefault(self.n) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @inherit_doc -class Normalizer(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): +class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ Normalize a vector to have unit norm using the given p-norm. @@ -2242,6 +2278,18 @@ def getP(self): """ return self.getOrDefault(self.p) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + class _OneHotEncoderParams(HasInputCols, HasOutputCols, HasHandleInvalid): """ @@ -2418,7 +2466,8 @@ def categorySizes(self): @inherit_doc -class PolynomialExpansion(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): +class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, + JavaMLWritable): """ Perform feature expansion in a polynomial space. As said in `wikipedia of Polynomial Expansion `_, "In mathematics, an @@ -2485,6 +2534,18 @@ def getDegree(self): """ return self.getOrDefault(self.degree) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. 
+ """ + return self._set(outputCol=value) + @inherit_doc class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols, @@ -2927,7 +2988,7 @@ def range(self): @inherit_doc @ignore_unicode_prefix -class RegexTokenizer(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): +class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A regex based tokenizer that extracts tokens either by using the provided regex pattern (in Java dialect) to split the text @@ -3058,6 +3119,18 @@ def getToLowercase(self): """ return self.getOrDefault(self.toLowercase) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @inherit_doc class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): @@ -3737,7 +3810,7 @@ def loadDefaultStopWords(language): @inherit_doc @ignore_unicode_prefix -class Tokenizer(JavaUnaryTransformer, JavaMLReadable, JavaMLWritable): +class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A tokenizer that converts the input string to lowercase and then splits it by white spaces. @@ -3790,6 +3863,18 @@ def setParams(self, inputCol=None, outputCol=None): kwargs = self._input_kwargs return self._set(**kwargs) + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + return self._set(outputCol=value) + @inherit_doc class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol, HasHandleInvalid, JavaMLReadable, diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index d8e6c0047dda9..2e2f43018cb08 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -23,7 +23,7 @@ from pyspark import since from pyspark import SparkContext from pyspark.sql import DataFrame -from pyspark.ml import Estimator, Transformer, Model, UnaryTransformer +from pyspark.ml import Estimator, Transformer, Model from pyspark.ml.param import Params from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol from pyspark.ml.util import _jvm @@ -331,23 +331,6 @@ class JavaTransformer(JavaParams, Transformer): available as _java_obj. """ - __metaclass__ = ABCMeta - - def _transform(self, dataset): - self._transfer_params_to_java() - return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) - - -@inherit_doc -class JavaUnaryTransformer(JavaParams, UnaryTransformer): - """ - Base class for :py:class:`UnaryTransformer`s that wrap Java/Scala - implementations. Subclasses should ensure they have the transformer Java object - available as _java_obj. 
- """ - - __metaclass__ = ABCMeta - def _transform(self, dataset): self._transfer_params_to_java() return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) From 03c2e4a6c7581402de0c1067c31f5655d9992364 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 24 Oct 2019 07:50:06 -0700 Subject: [PATCH 4/4] address comments --- python/pyspark/ml/wrapper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 2e2f43018cb08..47e4921541ea2 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -331,6 +331,8 @@ class JavaTransformer(JavaParams, Transformer): available as _java_obj. """ + __metaclass__ = ABCMeta + def _transform(self, dataset): self._transfer_params_to_java() return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
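
Taken together, these patches replace the code-generated setters on the shared param mixins with explicit per-class setters that delegate to Params._set and return self. As an illustration only (not part of the patches), here is a minimal doctest-style sketch of the resulting user-facing pattern, using the Tokenizer setters added above and assuming an active SparkSession as in the other doctests in this series:

    >>> from pyspark.ml.feature import Tokenizer
    >>> tok = Tokenizer()
    >>> tok.setInputCol("text").setOutputCol("words")  # _set returns self, so setters chain
    Tokenizer...
    >>> tok.getInputCol()
    'text'
    >>> tok.getOutputCol()
    'words'

Writing the setters out on each concrete class means every setter carries its own docstring and shows up in that class's API documentation, at the cost of the repetition visible in the hunks above.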