From 25258aeff218e2944208aa62643fe4b3e801e219 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 26 Oct 2016 17:48:20 +0800 Subject: [PATCH 01/20] create pr --- .../org/apache/spark/ml/regression/LinearRegression.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 534ef87ec64ee..23081bfe23613 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -204,6 +204,10 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String Instance(label, weight, features) } + val instr = Instrumentation.create(this, instances) + instr.logParams(params : _*) + instr.logNumFeatures(numFeatures) + if (($(solver) == "auto" && numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == "normal") { // For low dimensional data, WeightedLeastSquares is more efficient since the @@ -225,6 +229,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String summaryModel, model.diagInvAtWA.toArray, model.objectiveHistory) + instr.logSuccess(lrModel) return lrModel.setSummary(Some(trainingSummary)) } @@ -279,6 +284,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model, Array(0D), Array(0D)) + instr.logSuccess(model) return model.setSummary(Some(trainingSummary)) } else { require($(regParam) == 0.0, "The standard deviation of the label is zero. " + @@ -401,6 +407,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model, Array(0D), objectiveHistory) + instr.logSuccess(model) model.setSummary(Some(trainingSummary)) } From 2bc4e29813499a8643b30d14538ecf1431ff1f55 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 27 Oct 2016 13:26:17 +0800 Subject: [PATCH 02/20] fix nits --- .../org/apache/spark/ml/regression/LinearRegression.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 23081bfe23613..ef04aec12d046 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -91,7 +91,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String setDefault(regParam -> 0.0) /** - * Set if we should fit the intercept + * Set if we should fit the intercept. * Default is true. * * @group setParam @@ -256,10 +256,10 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String val rawYStd = math.sqrt(ySummarizer.variance(0)) if (rawYStd == 0.0) { if ($(fitIntercept) || yMean == 0.0) { - // If the rawYStd is zero and fitIntercept=true, then the intercept is yMean with + // If the rawYStd==0 and fitIntercept==true, then the intercept is yMean with // zero coefficient; as a result, training is not needed. // Also, if yMean==0 and rawYStd==0, all the coefficients are zero regardless of - // the fitIntercept + // the fitIntercept. if (yMean == 0.0) { logWarning(s"Mean and standard deviation of the label are zero, so the coefficients " + s"and the intercept will all be zero; as a result, training is not needed.") From b0b2cc30884eba4cda72b67e59609d38762708e7 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 28 Oct 2016 15:45:22 +0800 Subject: [PATCH 03/20] add instr for regression --- .../spark/ml/regression/AFTSurvivalRegression.scala | 5 +++++ .../spark/ml/regression/GeneralizedLinearRegression.scala | 5 +++++ .../apache/spark/ml/regression/IsotonicRegression.scala | 8 +++++++- .../org/apache/spark/ml/regression/LinearRegression.scala | 3 +-- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index af68e7b9d5809..5344914c1f300 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -227,6 +227,10 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) val numFeatures = featuresStd.size + val instr = Instrumentation.create(this, dataset) + instr.logParams(params : _*) + instr.logNumFeatures(numFeatures) + if (!$(fitIntercept) && (0 until numFeatures).exists { i => featuresStd(i) == 0.0 && featuresSummarizer.mean(i) != 0.0 }) { logWarning("Fitting AFTSurvivalRegressionModel without intercept on dataset with " + @@ -277,6 +281,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val intercept = parameters(1) val scale = math.exp(parameters(0)) val model = new AFTSurvivalRegressionModel(uid, coefficients, intercept, scale) + instr.logSuccess(model) copyValues(model.setParent(this)) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 3891ae63a4e1e..589cab2b97f72 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -251,6 +251,10 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val val familyAndLink = new FamilyAndLink(familyObj, linkObj) val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size + val instr = Instrumentation.create(this, dataset) + instr.logParams(params : _*) + instr.logNumFeatures(numFeatures) + if (numFeatures > WeightedLeastSquares.MAX_NUM_FEATURES) { val msg = "Currently, GeneralizedLinearRegression only supports number of features" + s" <= ${WeightedLeastSquares.MAX_NUM_FEATURES}. Found $numFeatures in the input dataset." @@ -288,6 +292,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val .setParent(this)) val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model, irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver) + instr.logSuccess(model) model.setSummary(Some(trainingSummary)) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala index c378a99e3c230..be16cf2f66f9b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala @@ -171,10 +171,16 @@ class IsotonicRegression @Since("1.5.0") (@Since("1.5.0") override val uid: Stri val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) + val instr = Instrumentation.create(this, dataset) + instr.logParams(params : _*) + instr.logNumFeatures(1) + val isotonicRegression = new MLlibIsotonicRegression().setIsotonic($(isotonic)) val oldModel = isotonicRegression.run(instances) - copyValues(new IsotonicRegressionModel(uid, oldModel).setParent(this)) + val model = copyValues(new IsotonicRegressionModel(uid, oldModel).setParent(this)) + instr.logSuccess(model) + model } @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index ef04aec12d046..764c6dad634a6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -204,7 +204,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String Instance(label, weight, features) } - val instr = Instrumentation.create(this, instances) + val instr = Instrumentation.create(this, dataset) instr.logParams(params : _*) instr.logNumFeatures(numFeatures) @@ -230,7 +230,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model.diagInvAtWA.toArray, model.objectiveHistory) instr.logSuccess(lrModel) - return lrModel.setSummary(Some(trainingSummary)) } From 41aff476653b915dad9b45a9fceaaa066e0a600b Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 28 Oct 2016 16:49:40 +0800 Subject: [PATCH 04/20] add instr for algos --- .../MultilayerPerceptronClassifier.scala | 11 ++++++++++- .../apache/spark/ml/classification/NaiveBayes.scala | 10 +++++++++- .../scala/org/apache/spark/ml/clustering/LDA.scala | 9 ++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index aaaf7df34576a..96057b948b8de 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -232,8 +232,13 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( * @return Fitted model */ override protected def train(dataset: Dataset[_]): MultilayerPerceptronClassificationModel = { + val instr = Instrumentation.create(this, dataset) + instr.logParams(params : _*) + val myLayers = $(layers) val labels = myLayers.last + instr.logNumClasses(labels) + val lpData = extractLabeledPoints(dataset) val data = lpData.map(lp => LabelConverter.encodeLabeledPoint(lp, labels)) val topology = FeedForwardTopology.multiLayerPerceptron(myLayers, softmaxOnTop = true) @@ -258,7 +263,11 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( } trainer.setStackSize($(blockSize)) val mlpModel = trainer.train(data) - new MultilayerPerceptronClassificationModel(uid, myLayers, mlpModel.weights) + val model = new MultilayerPerceptronClassificationModel(uid, myLayers, mlpModel.weights) + + instr.logNumFeatures(model.numFeatures) + instr.logSuccess(model) + model } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index e90040dbf182b..70d744c109360 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -147,7 +147,11 @@ class NaiveBayes @Since("1.5.0") ( } } + val instr = Instrumentation.create(this, dataset) + instr.logParams(params : _*) + val numFeatures = dataset.select(col($(featuresCol))).head().getAs[Vector](0).size + instr.logNumFeatures(numFeatures) val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol)) // Aggregates term frequencies per label. @@ -169,6 +173,7 @@ class NaiveBayes @Since("1.5.0") ( }).collect().sortBy(_._1) val numLabels = aggregated.length + instr.logNumClasses(numLabels) val numDocuments = aggregated.map(_._2._1).sum val labelArray = new Array[Double](numLabels) @@ -198,7 +203,10 @@ class NaiveBayes @Since("1.5.0") ( val pi = Vectors.dense(piArray) val theta = new DenseMatrix(numLabels, numFeatures, thetaArray, true) - new NaiveBayesModel(uid, pi, theta).setOldLabels(labelArray) + val model = new NaiveBayesModel(uid, pi, theta).setOldLabels(labelArray) + + instr.logSuccess(model) + model } @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 583e5e0928eba..e76dc3585da9d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -888,6 +888,10 @@ class LDA @Since("1.6.0") ( @Since("2.0.0") override def fit(dataset: Dataset[_]): LDAModel = { transformSchema(dataset.schema, logging = true) + + val instr = Instrumentation.create(this, dataset) + instr.logParams(params : _*) + val oldLDA = new OldLDA() .setK($(k)) .setDocConcentration(getOldDocConcentration) @@ -905,7 +909,10 @@ class LDA @Since("1.6.0") ( case m: OldDistributedLDAModel => new DistributedLDAModel(uid, m.vocabSize, m, dataset.sparkSession, None) } - copyValues(newModel).setParent(this) + val m = copyValues(newModel).setParent(this) + + instr.logSuccess(m) + m } @Since("1.6.0") From 5dc6b43f84a679bce83e0e2592580009a6d0f4ed Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 28 Oct 2016 18:08:41 +0800 Subject: [PATCH 05/20] update --- .../spark/ml/classification/DecisionTreeClassifier.scala | 2 ++ .../classification/MultilayerPerceptronClassifier.scala | 2 +- .../org/apache/spark/ml/classification/OneVsRest.scala | 8 ++++++++ .../spark/ml/classification/RandomForestClassifier.scala | 2 ++ .../main/scala/org/apache/spark/ml/clustering/LDA.scala | 6 +++--- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 9f60f0896ec52..447b1a972231d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -112,11 +112,13 @@ class DecisionTreeClassifier @Since("1.4.0") ( val instr = Instrumentation.create(this, oldDataset) instr.logParams(params: _*) + instr.logNumClasses(numClasses) val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all", seed = $(seed), instr = Some(instr), parentUID = Some(uid)) val m = trees.head.asInstanceOf[DecisionTreeClassificationModel] + instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 96057b948b8de..93ed7affe292b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -238,6 +238,7 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( val myLayers = $(layers) val labels = myLayers.last instr.logNumClasses(labels) + instr.logNumFeatures(myLayers.head) val lpData = extractLabeledPoints(dataset) val data = lpData.map(lp => LabelConverter.encodeLabeledPoint(lp, labels)) @@ -265,7 +266,6 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( val mlpModel = trainer.train(data) val model = new MultilayerPerceptronClassificationModel(uid, myLayers, mlpModel.weights) - instr.logNumFeatures(model.numFeatures) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index e58b30d66588c..8978caa1cf7d9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -317,6 +317,11 @@ final class OneVsRest @Since("1.4.0") ( } val numClasses = MetadataUtils.getNumClasses(labelSchema).fold(computeNumClasses())(identity) + val instr = Instrumentation.create(this, dataset) + instr.logParams(params : _*) + instr.logParams($(classifier).params : _*) + instr.logNumClasses(numClasses) + val multiclassLabeled = dataset.select($(labelCol), $(featuresCol)) // persist if underlying dataset is not persistent. @@ -352,6 +357,9 @@ final class OneVsRest @Since("1.4.0") ( case attr: Attribute => attr } val model = new OneVsRestModel(uid, labelAttribute.toMetadata(), models).setParent(this) + + instr.logNumFeatures(models.head.numFeatures) + instr.logSuccess(model) copyValues(model) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 5bbaafeff329f..d2df9a1b548cf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -132,6 +132,7 @@ class RandomForestClassifier @Since("1.4.0") ( val instr = Instrumentation.create(this, oldDataset) instr.logParams(params: _*) + instr.logNumClasses(numClasses) val trees = RandomForest .run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed, Some(instr)) @@ -139,6 +140,7 @@ class RandomForestClassifier @Since("1.4.0") ( val numFeatures = oldDataset.first().features.size val m = new RandomForestClassificationModel(trees, numFeatures, numClasses) + instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index e76dc3585da9d..41b1c192c798d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -909,10 +909,10 @@ class LDA @Since("1.6.0") ( case m: OldDistributedLDAModel => new DistributedLDAModel(uid, m.vocabSize, m, dataset.sparkSession, None) } - val m = copyValues(newModel).setParent(this) - instr.logSuccess(m) - m + val model = copyValues(newModel).setParent(this) + instr.logSuccess(newModel) + model } @Since("1.6.0") From 457196affcc7356389dcd45a20cdc3ed668dfdc1 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 28 Oct 2016 19:15:34 +0800 Subject: [PATCH 06/20] remove cla.params --- .../scala/org/apache/spark/ml/classification/OneVsRest.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index 8978caa1cf7d9..a0a5e7b521c24 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -319,7 +319,6 @@ final class OneVsRest @Since("1.4.0") ( val instr = Instrumentation.create(this, dataset) instr.logParams(params : _*) - instr.logParams($(classifier).params : _*) instr.logNumClasses(numClasses) val multiclassLabeled = dataset.select($(labelCol), $(featuresCol)) From 22c9f0c1d941a3e27fb7ed7410e658503a9f07fc Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 28 Oct 2016 19:24:27 +0800 Subject: [PATCH 07/20] add numfea in rf and dt --- .../org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 1 + .../org/apache/spark/ml/regression/RandomForestRegressor.scala | 1 + 2 files changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 01c5cc1c7efa9..5770b81cd7c41 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -112,6 +112,7 @@ class DecisionTreeRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S seed = $(seed), instr = Some(instr), parentUID = Some(uid)) val m = trees.head.asInstanceOf[DecisionTreeRegressionModel] + instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index ca4a50b825dde..f3f1f93ea18ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -130,6 +130,7 @@ class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S val numFeatures = oldDataset.first().features.size val m = new RandomForestRegressionModel(trees, numFeatures) + instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } From ff49d7018fd039fa89875670a0e701c3aa2aac64 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 28 Oct 2016 20:04:35 +0800 Subject: [PATCH 08/20] revert OvR --- .../org/apache/spark/ml/classification/OneVsRest.scala | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index a0a5e7b521c24..e58b30d66588c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -317,10 +317,6 @@ final class OneVsRest @Since("1.4.0") ( } val numClasses = MetadataUtils.getNumClasses(labelSchema).fold(computeNumClasses())(identity) - val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) - instr.logNumClasses(numClasses) - val multiclassLabeled = dataset.select($(labelCol), $(featuresCol)) // persist if underlying dataset is not persistent. @@ -356,9 +352,6 @@ final class OneVsRest @Since("1.4.0") ( case attr: Attribute => attr } val model = new OneVsRestModel(uid, labelAttribute.toMetadata(), models).setParent(this) - - instr.logNumFeatures(models.head.numFeatures) - instr.logSuccess(model) copyValues(model) } From e2a1546c42f3bebbb1f913caf5691871ccef5cc4 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sat, 29 Oct 2016 10:56:13 +0800 Subject: [PATCH 09/20] revert trees --- .../apache/spark/ml/classification/DecisionTreeClassifier.scala | 2 -- .../apache/spark/ml/classification/RandomForestClassifier.scala | 2 -- .../org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 1 - .../org/apache/spark/ml/regression/RandomForestRegressor.scala | 1 - 4 files changed, 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 447b1a972231d..9f60f0896ec52 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -112,13 +112,11 @@ class DecisionTreeClassifier @Since("1.4.0") ( val instr = Instrumentation.create(this, oldDataset) instr.logParams(params: _*) - instr.logNumClasses(numClasses) val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all", seed = $(seed), instr = Some(instr), parentUID = Some(uid)) val m = trees.head.asInstanceOf[DecisionTreeClassificationModel] - instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index d2df9a1b548cf..5bbaafeff329f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -132,7 +132,6 @@ class RandomForestClassifier @Since("1.4.0") ( val instr = Instrumentation.create(this, oldDataset) instr.logParams(params: _*) - instr.logNumClasses(numClasses) val trees = RandomForest .run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed, Some(instr)) @@ -140,7 +139,6 @@ class RandomForestClassifier @Since("1.4.0") ( val numFeatures = oldDataset.first().features.size val m = new RandomForestClassificationModel(trees, numFeatures, numClasses) - instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 5770b81cd7c41..01c5cc1c7efa9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -112,7 +112,6 @@ class DecisionTreeRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S seed = $(seed), instr = Some(instr), parentUID = Some(uid)) val m = trees.head.asInstanceOf[DecisionTreeRegressionModel] - instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index f3f1f93ea18ec..ca4a50b825dde 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -130,7 +130,6 @@ class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S val numFeatures = oldDataset.first().features.size val m = new RandomForestRegressionModel(trees, numFeatures) - instr.logNumFeatures(m.numFeatures) instr.logSuccess(m) m } From 867f08b39fb0838864b34953478d0622e0b1b2ca Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 1 Nov 2016 14:56:23 +0800 Subject: [PATCH 10/20] list params; correct logSuccess place; update comment of logSuccess --- .../classification/MultilayerPerceptronClassifier.scala | 3 ++- .../org/apache/spark/ml/classification/NaiveBayes.scala | 4 ++-- .../main/scala/org/apache/spark/ml/clustering/LDA.scala | 6 ++++-- .../spark/ml/regression/AFTSurvivalRegression.scala | 8 +++++--- .../spark/ml/regression/GeneralizedLinearRegression.scala | 4 +++- .../apache/spark/ml/regression/IsotonicRegression.scala | 2 +- .../org/apache/spark/ml/regression/LinearRegression.scala | 3 ++- .../scala/org/apache/spark/ml/util/Instrumentation.scala | 2 +- 8 files changed, 20 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 93ed7affe292b..1b520e88bcd06 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -233,7 +233,8 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( */ override protected def train(dataset: Dataset[_]): MultilayerPerceptronClassificationModel = { val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) + instr.logParams(labelCol, featuresCol, predictionCol, layers, maxIter, + tol, blockSize, solver, stepSize, seed) val myLayers = $(layers) val labels = myLayers.last diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 70d744c109360..e5713599406e0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -148,7 +148,8 @@ class NaiveBayes @Since("1.5.0") ( } val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) + instr.logParams(labelCol, featuresCol, weightCol, predictionCol, rawPredictionCol, + probabilityCol, modelType, smoothing, thresholds) val numFeatures = dataset.select(col($(featuresCol))).head().getAs[Vector](0).size instr.logNumFeatures(numFeatures) @@ -204,7 +205,6 @@ class NaiveBayes @Since("1.5.0") ( val pi = Vectors.dense(piArray) val theta = new DenseMatrix(numLabels, numFeatures, thetaArray, true) val model = new NaiveBayesModel(uid, pi, theta).setOldLabels(labelArray) - instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 41b1c192c798d..2e54f9cff95b2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -890,7 +890,9 @@ class LDA @Since("1.6.0") ( transformSchema(dataset.schema, logging = true) val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) + instr.logParams(featuresCol, topicDistributionCol, k, maxIter, subsamplingRate, + checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, topicConcentration, + learningDecay, optimizer, learningOffset, seed) val oldLDA = new OldLDA() .setK($(k)) @@ -911,7 +913,7 @@ class LDA @Since("1.6.0") ( } val model = copyValues(newModel).setParent(this) - instr.logSuccess(newModel) + instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 5344914c1f300..fc02bef02654a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -228,7 +228,8 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val numFeatures = featuresStd.size val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) + instr.logParams(labelCol, featuresCol, censorCol, predictionCol, quantilesCol, + fitIntercept, maxIter, tol) instr.logNumFeatures(numFeatures) if (!$(fitIntercept) && (0 until numFeatures).exists { i => @@ -280,9 +281,10 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val coefficients = Vectors.dense(rawCoefficients) val intercept = parameters(1) val scale = math.exp(parameters(0)) - val model = new AFTSurvivalRegressionModel(uid, coefficients, intercept, scale) + val model = copyValues(new AFTSurvivalRegressionModel(uid, coefficients, + intercept, scale).setParent(this)) instr.logSuccess(model) - copyValues(model.setParent(this)) + model } @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 589cab2b97f72..432b5eefeb786 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -252,7 +252,8 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) + instr.logParams(labelCol, featuresCol, weightCol, predictionCol, linkPredictionCol, + family, solver, fitIntercept, link, maxIter, regParam, tol) instr.logNumFeatures(numFeatures) if (numFeatures > WeightedLeastSquares.MAX_NUM_FEATURES) { @@ -278,6 +279,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val .setParent(this)) val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model, wlsModel.diagInvAtWA.toArray, 1, getSolver) + instr.logSuccess(model) return model.setSummary(Some(trainingSummary)) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala index be16cf2f66f9b..1ed9d3c809642 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala @@ -172,7 +172,7 @@ class IsotonicRegression @Since("1.5.0") (@Since("1.5.0") override val uid: Stri if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) + instr.logParams(labelCol, featuresCol, weightCol, predictionCol, featureIndex, isotonic) instr.logNumFeatures(1) val isotonicRegression = new MLlibIsotonicRegression().setIsotonic($(isotonic)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 764c6dad634a6..f49af829b6102 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -205,7 +205,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String } val instr = Instrumentation.create(this, dataset) - instr.logParams(params : _*) + instr.logParams(labelCol, featuresCol, weightCol, predictionCol, solver, tol, + elasticNetParam, fitIntercept, maxIter, regParam, standardization) instr.logNumFeatures(numFeatures) if (($(solver) == "auto" && diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala index 71a626647a5f4..feba56d18b05f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala @@ -92,7 +92,7 @@ private[spark] class Instrumentation[E <: Estimator[_]] private ( } /** - * Logs the successful completion of the training session and the value of the learned model. + * Logs the successful completion of the training session. */ def logSuccess(model: Model[_]): Unit = { log(s"training finished") From d129416e0e6424d3f88cd715a1700747990a9a57 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 2 Nov 2016 11:28:21 +0800 Subject: [PATCH 11/20] select params in gbt and rf --- .../org/apache/spark/ml/classification/GBTClassifier.scala | 5 ++++- .../ml/classification/MultilayerPerceptronClassifier.scala | 2 +- .../org/apache/spark/ml/classification/NaiveBayes.scala | 2 +- .../spark/ml/classification/RandomForestClassifier.scala | 5 ++++- .../src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 4 ++-- .../apache/spark/ml/regression/AFTSurvivalRegression.scala | 2 +- .../scala/org/apache/spark/ml/regression/GBTRegressor.scala | 5 ++++- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- .../org/apache/spark/ml/regression/LinearRegression.scala | 2 +- .../apache/spark/ml/regression/RandomForestRegressor.scala | 5 ++++- 10 files changed, 23 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index c99b63b25d2e7..693a619e1fe52 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -160,7 +160,10 @@ class GBTClassifier @Since("1.4.0") ( val boostingStrategy = super.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Classification) val instr = Instrumentation.create(this, oldDataset) - instr.logParams(params: _*) + instr.logParams(labelCol, featuresCol, predictionCol, impurity, lossType, + maxDepth, maxBins, maxIter, maxMemoryInMB, minInfoGain, + minInstancesPerNode, seed, stepSize, subsamplingRate, + cacheNodeIds, checkpointInterval) instr.logNumFeatures(numFeatures) instr.logNumClasses(2) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 1b520e88bcd06..8b7ab16a16ee0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -234,7 +234,7 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( override protected def train(dataset: Dataset[_]): MultilayerPerceptronClassificationModel = { val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, predictionCol, layers, maxIter, - tol, blockSize, solver, stepSize, seed) + tol, blockSize, solver, stepSize, seed) val myLayers = $(layers) val labels = myLayers.last diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index e5713599406e0..290acd8a82097 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -149,7 +149,7 @@ class NaiveBayes @Since("1.5.0") ( val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, weightCol, predictionCol, rawPredictionCol, - probabilityCol, modelType, smoothing, thresholds) + probabilityCol, modelType, smoothing, thresholds) val numFeatures = dataset.select(col($(featuresCol))).head().getAs[Vector](0).size instr.logNumFeatures(numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 5bbaafeff329f..4ad36d99870b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -131,7 +131,10 @@ class RandomForestClassifier @Since("1.4.0") ( super.getOldStrategy(categoricalFeatures, numClasses, OldAlgo.Classification, getOldImpurity) val instr = Instrumentation.create(this, oldDataset) - instr.logParams(params: _*) + instr.logParams(labelCol, featuresCol, predictionCol, probabilityCol, rawPredictionCol, + impurity, numTrees, featureSubsetStrategy, maxDepth, maxBins, maxMemoryInMB, + minInfoGain, minInstancesPerNode, seed, subsamplingRate, thresholds, + cacheNodeIds, checkpointInterval) val trees = RandomForest .run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed, Some(instr)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 2e54f9cff95b2..2abbe350e90d1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -891,8 +891,8 @@ class LDA @Since("1.6.0") ( val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, topicDistributionCol, k, maxIter, subsamplingRate, - checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, topicConcentration, - learningDecay, optimizer, learningOffset, seed) + checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, + topicConcentration, learningDecay, optimizer, learningOffset, seed) val oldLDA = new OldLDA() .setK($(k)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index fc02bef02654a..fe1bc1e61cc22 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -229,7 +229,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, censorCol, predictionCol, quantilesCol, - fitIntercept, maxIter, tol) + fitIntercept, maxIter, tol) instr.logNumFeatures(numFeatures) if (!$(fitIntercept) && (0 until numFeatures).exists { i => diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index f8ab3d3a45a49..166db7ded2946 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -148,7 +148,10 @@ class GBTRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: String) val boostingStrategy = super.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Regression) val instr = Instrumentation.create(this, oldDataset) - instr.logParams(params: _*) + instr.logParams(labelCol, featuresCol, predictionCol, impurity, lossType, + maxDepth, maxBins, maxIter, maxMemoryInMB, minInfoGain, + minInstancesPerNode, seed, stepSize, subsamplingRate, + cacheNodeIds, checkpointInterval) instr.logNumFeatures(numFeatures) val (baseLearners, learnerWeights) = GradientBoostedTrees.run(oldDataset, boostingStrategy, diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 432b5eefeb786..633b20a3648d2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -253,7 +253,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, weightCol, predictionCol, linkPredictionCol, - family, solver, fitIntercept, link, maxIter, regParam, tol) + family, solver, fitIntercept, link, maxIter, regParam, tol) instr.logNumFeatures(numFeatures) if (numFeatures > WeightedLeastSquares.MAX_NUM_FEATURES) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index f49af829b6102..b4f42cddf4091 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -206,7 +206,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, weightCol, predictionCol, solver, tol, - elasticNetParam, fitIntercept, maxIter, regParam, standardization) + elasticNetParam, fitIntercept, maxIter, regParam, standardization) instr.logNumFeatures(numFeatures) if (($(solver) == "auto" && diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index ca4a50b825dde..2647240a7c334 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -122,7 +122,10 @@ class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S super.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, getOldImpurity) val instr = Instrumentation.create(this, oldDataset) - instr.logParams(params: _*) + instr.logParams(labelCol, featuresCol, predictionCol, impurity, numTrees, + featureSubsetStrategy, maxDepth, maxBins, maxMemoryInMB, + minInfoGain, minInstancesPerNode, seed, subsamplingRate, + cacheNodeIds, checkpointInterval) val trees = RandomForest .run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed, Some(instr)) From e62a6880518978621cb4c2aa9bd3116c535e77db Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 3 Nov 2016 09:37:02 +0800 Subject: [PATCH 12/20] fix glr,aft,lir --- .../org/apache/spark/ml/regression/AFTSurvivalRegression.scala | 2 +- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- .../org/apache/spark/ml/regression/LinearRegression.scala | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index fe1bc1e61cc22..e0b0ce9aacb32 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -229,7 +229,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, censorCol, predictionCol, quantilesCol, - fitIntercept, maxIter, tol) + fitIntercept, maxIter, tol, aggregationDepth) instr.logNumFeatures(numFeatures) if (!$(fitIntercept) && (0 until numFeatures).exists { i => diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 633b20a3648d2..3af4a475d2801 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -269,7 +269,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val Instance(label, weight, features) } - if (familyObj == Gaussian && linkObj == Identity) { + val model = if (familyObj == Gaussian && linkObj == Identity) { // TODO: Make standardizeFeatures and standardizeLabel configurable. val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index b4f42cddf4091..38c80256d03f2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -206,7 +206,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, weightCol, predictionCol, solver, tol, - elasticNetParam, fitIntercept, maxIter, regParam, standardization) + elasticNetParam, fitIntercept, maxIter, regParam, standardization, + aggregationDepth) instr.logNumFeatures(numFeatures) if (($(solver) == "auto" && From 19828e1ef331e73c3fd22573f8e06ea8c1d8f28b Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 4 Nov 2016 09:49:33 +0800 Subject: [PATCH 13/20] fix style --- .../org/apache/spark/ml/classification/GBTClassifier.scala | 5 ++--- .../ml/classification/MultilayerPerceptronClassifier.scala | 4 ++-- .../org/apache/spark/ml/classification/NaiveBayes.scala | 2 +- .../spark/ml/classification/RandomForestClassifier.scala | 5 ++--- .../src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 4 ++-- .../apache/spark/ml/regression/AFTSurvivalRegression.scala | 2 +- .../scala/org/apache/spark/ml/regression/GBTRegressor.scala | 5 ++--- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- .../org/apache/spark/ml/regression/LinearRegression.scala | 3 +-- .../apache/spark/ml/regression/RandomForestRegressor.scala | 5 ++--- 10 files changed, 16 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 693a619e1fe52..bb93ba5d9cc51 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -161,9 +161,8 @@ class GBTClassifier @Since("1.4.0") ( val instr = Instrumentation.create(this, oldDataset) instr.logParams(labelCol, featuresCol, predictionCol, impurity, lossType, - maxDepth, maxBins, maxIter, maxMemoryInMB, minInfoGain, - minInstancesPerNode, seed, stepSize, subsamplingRate, - cacheNodeIds, checkpointInterval) + maxDepth, maxBins, maxIter, maxMemoryInMB, minInfoGain, minInstancesPerNode, + seed, stepSize, subsamplingRate, cacheNodeIds, checkpointInterval) instr.logNumFeatures(numFeatures) instr.logNumClasses(2) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 8b7ab16a16ee0..93cc1e6f09727 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -233,8 +233,8 @@ class MultilayerPerceptronClassifier @Since("1.5.0") ( */ override protected def train(dataset: Dataset[_]): MultilayerPerceptronClassificationModel = { val instr = Instrumentation.create(this, dataset) - instr.logParams(labelCol, featuresCol, predictionCol, layers, maxIter, - tol, blockSize, solver, stepSize, seed) + instr.logParams(labelCol, featuresCol, predictionCol, layers, maxIter, tol, + blockSize, solver, stepSize, seed) val myLayers = $(layers) val labels = myLayers.last diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 290acd8a82097..e5713599406e0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -149,7 +149,7 @@ class NaiveBayes @Since("1.5.0") ( val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, weightCol, predictionCol, rawPredictionCol, - probabilityCol, modelType, smoothing, thresholds) + probabilityCol, modelType, smoothing, thresholds) val numFeatures = dataset.select(col($(featuresCol))).head().getAs[Vector](0).size instr.logNumFeatures(numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 4ad36d99870b5..ce834f1d17e0d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -132,9 +132,8 @@ class RandomForestClassifier @Since("1.4.0") ( val instr = Instrumentation.create(this, oldDataset) instr.logParams(labelCol, featuresCol, predictionCol, probabilityCol, rawPredictionCol, - impurity, numTrees, featureSubsetStrategy, maxDepth, maxBins, maxMemoryInMB, - minInfoGain, minInstancesPerNode, seed, subsamplingRate, thresholds, - cacheNodeIds, checkpointInterval) + impurity, numTrees, featureSubsetStrategy, maxDepth, maxBins, maxMemoryInMB, minInfoGain, + minInstancesPerNode, seed, subsamplingRate, thresholds, cacheNodeIds, checkpointInterval) val trees = RandomForest .run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed, Some(instr)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 2abbe350e90d1..2e54f9cff95b2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -891,8 +891,8 @@ class LDA @Since("1.6.0") ( val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, topicDistributionCol, k, maxIter, subsamplingRate, - checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, - topicConcentration, learningDecay, optimizer, learningOffset, seed) + checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, topicConcentration, + learningDecay, optimizer, learningOffset, seed) val oldLDA = new OldLDA() .setK($(k)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index e0b0ce9aacb32..b5bcf47419f2d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -229,7 +229,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, censorCol, predictionCol, quantilesCol, - fitIntercept, maxIter, tol, aggregationDepth) + fitIntercept, maxIter, tol, aggregationDepth) instr.logNumFeatures(numFeatures) if (!$(fitIntercept) && (0 until numFeatures).exists { i => diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 166db7ded2946..08d175cb94442 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -149,9 +149,8 @@ class GBTRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: String) val instr = Instrumentation.create(this, oldDataset) instr.logParams(labelCol, featuresCol, predictionCol, impurity, lossType, - maxDepth, maxBins, maxIter, maxMemoryInMB, minInfoGain, - minInstancesPerNode, seed, stepSize, subsamplingRate, - cacheNodeIds, checkpointInterval) + maxDepth, maxBins, maxIter, maxMemoryInMB, minInfoGain, minInstancesPerNode, + seed, stepSize, subsamplingRate, cacheNodeIds, checkpointInterval) instr.logNumFeatures(numFeatures) val (baseLearners, learnerWeights) = GradientBoostedTrees.run(oldDataset, boostingStrategy, diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 3af4a475d2801..fed5306ae2900 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -253,7 +253,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, weightCol, predictionCol, linkPredictionCol, - family, solver, fitIntercept, link, maxIter, regParam, tol) + family, solver, fitIntercept, link, maxIter, regParam, tol) instr.logNumFeatures(numFeatures) if (numFeatures > WeightedLeastSquares.MAX_NUM_FEATURES) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 38c80256d03f2..cf915f6c5d6ee 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -206,8 +206,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, weightCol, predictionCol, solver, tol, - elasticNetParam, fitIntercept, maxIter, regParam, standardization, - aggregationDepth) + elasticNetParam, fitIntercept, maxIter, regParam, standardization, aggregationDepth) instr.logNumFeatures(numFeatures) if (($(solver) == "auto" && diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 2647240a7c334..2f524a8c5784d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -123,9 +123,8 @@ class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S val instr = Instrumentation.create(this, oldDataset) instr.logParams(labelCol, featuresCol, predictionCol, impurity, numTrees, - featureSubsetStrategy, maxDepth, maxBins, maxMemoryInMB, - minInfoGain, minInstancesPerNode, seed, subsamplingRate, - cacheNodeIds, checkpointInterval) + featureSubsetStrategy, maxDepth, maxBins, maxMemoryInMB, minInfoGain, + minInstancesPerNode, seed, subsamplingRate, cacheNodeIds, checkpointInterval) val trees = RandomForest .run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed, Some(instr)) From 6e273219dc5b106e6759cbe1db8db130f6e10dd1 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 30 Nov 2016 16:46:55 +0800 Subject: [PATCH 14/20] update als --- .../main/scala/org/apache/spark/ml/recommendation/ALS.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index b466e2ed35681..3e852a683672a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -458,9 +458,9 @@ class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) } val instrLog = Instrumentation.create(this, ratings) - instrLog.logParams(rank, numUserBlocks, numItemBlocks, implicitPrefs, alpha, - userCol, itemCol, ratingCol, predictionCol, maxIter, - regParam, nonnegative, checkpointInterval, seed) + instrLog.logParams(rank, numUserBlocks, numItemBlocks, implicitPrefs, alpha, userCol, + itemCol, ratingCol, predictionCol, maxIter, regParam, nonnegative, checkpointInterval, + seed, intermediateStorageLevel, finalStorageLevel) val (userFactors, itemFactors) = ALS.train(ratings, rank = $(rank), numUserBlocks = $(numUserBlocks), numItemBlocks = $(numItemBlocks), maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs), From 6d48759a0edfce7286549fc544b564a4f9ce419c Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 30 Nov 2016 17:09:50 +0800 Subject: [PATCH 15/20] fix conflict --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index fed5306ae2900..432b5eefeb786 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -269,7 +269,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val Instance(label, weight, features) } - val model = if (familyObj == Gaussian && linkObj == Identity) { + if (familyObj == Gaussian && linkObj == Identity) { // TODO: Make standardizeFeatures and standardizeLabel configurable. val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) From c172efca17bc50b2c51dcf98b4ceebfdfa178347 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 30 Dec 2016 16:14:02 +0800 Subject: [PATCH 16/20] fix glr --- .../GeneralizedLinearRegression.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 432b5eefeb786..35891403f69ce 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -269,7 +269,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val Instance(label, weight, features) } - if (familyObj == Gaussian && linkObj == Identity) { + val model = if (familyObj == Gaussian && linkObj == Identity) { // TODO: Make standardizeFeatures and standardizeLabel configurable. val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) @@ -279,23 +279,23 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val .setParent(this)) val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model, wlsModel.diagInvAtWA.toArray, 1, getSolver) - instr.logSuccess(model) - return model.setSummary(Some(trainingSummary)) + model.setSummary(Some(trainingSummary)) + } else { + // Fit Generalized Linear Model by iteratively reweighted least squares (IRLS). + val initialModel = familyAndLink.initialize(instances, $(fitIntercept), $(regParam)) + val optimizer = new IterativelyReweightedLeastSquares(initialModel, familyAndLink.reweightFunc, + $(fitIntercept), $(regParam), $(maxIter), $(tol)) + val irlsModel = optimizer.fit(instances) + val model = copyValues( + new GeneralizedLinearRegressionModel(uid, irlsModel.coefficients, irlsModel.intercept) + .setParent(this)) + val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model, + irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver) + model.setSummary(Some(trainingSummary)) } - // Fit Generalized Linear Model by iteratively reweighted least squares (IRLS). - val initialModel = familyAndLink.initialize(instances, $(fitIntercept), $(regParam)) - val optimizer = new IterativelyReweightedLeastSquares(initialModel, familyAndLink.reweightFunc, - $(fitIntercept), $(regParam), $(maxIter), $(tol)) - val irlsModel = optimizer.fit(instances) - - val model = copyValues( - new GeneralizedLinearRegressionModel(uid, irlsModel.coefficients, irlsModel.intercept) - .setParent(this)) - val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model, - irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver) instr.logSuccess(model) - model.setSummary(Some(trainingSummary)) + model } @Since("2.0.0") From 00b91fd644ca86f784faf540bc222d9588cf7abd Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 30 Dec 2016 16:21:09 +0800 Subject: [PATCH 17/20] fix style --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 35891403f69ce..a32302bf5dfc8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -283,8 +283,8 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val } else { // Fit Generalized Linear Model by iteratively reweighted least squares (IRLS). val initialModel = familyAndLink.initialize(instances, $(fitIntercept), $(regParam)) - val optimizer = new IterativelyReweightedLeastSquares(initialModel, familyAndLink.reweightFunc, - $(fitIntercept), $(regParam), $(maxIter), $(tol)) + val optimizer = new IterativelyReweightedLeastSquares(initialModel, + familyAndLink.reweightFunc, $(fitIntercept), $(regParam), $(maxIter), $(tol)) val irlsModel = optimizer.fit(instances) val model = copyValues( new GeneralizedLinearRegressionModel(uid, irlsModel.coefficients, irlsModel.intercept) From c8693d870373978b4a43b2215a1b487215107d45 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 5 Jan 2017 12:43:39 +0800 Subject: [PATCH 18/20] update --- .../scala/org/apache/spark/ml/clustering/LDA.scala | 3 ++- .../spark/ml/regression/AFTSurvivalRegression.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 12 +++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 2e54f9cff95b2..e8b6c4105055d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -892,7 +892,7 @@ class LDA @Since("1.6.0") ( val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, topicDistributionCol, k, maxIter, subsamplingRate, checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, topicConcentration, - learningDecay, optimizer, learningOffset, seed) + docConcentration, learningDecay, optimizer, learningOffset, seed) val oldLDA = new OldLDA() .setK($(k)) @@ -912,6 +912,7 @@ class LDA @Since("1.6.0") ( new DistributedLDAModel(uid, m.vocabSize, m, dataset.sparkSession, None) } + instr.logNumFeatures(newModel.vocabSize) val model = copyValues(newModel).setParent(this) instr.logSuccess(model) model diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index b5bcf47419f2d..537f5ba3d4560 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -229,7 +229,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, censorCol, predictionCol, quantilesCol, - fitIntercept, maxIter, tol, aggregationDepth) + quantileProbabilities, fitIntercept, maxIter, tol, aggregationDepth) instr.logNumFeatures(numFeatures) if (!$(fitIntercept) && (0 until numFeatures).exists { i => diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index cf915f6c5d6ee..2de7e81d8d41e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -230,8 +230,10 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String summaryModel, model.diagInvAtWA.toArray, model.objectiveHistory) + + lrModel.setSummary(Some(trainingSummary)) instr.logSuccess(lrModel) - return lrModel.setSummary(Some(trainingSummary)) + return lrModel } val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE @@ -284,8 +286,10 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model, Array(0D), Array(0D)) + + model.setSummary(Some(trainingSummary)) instr.logSuccess(model) - return model.setSummary(Some(trainingSummary)) + return model } else { require($(regParam) == 0.0, "The standard deviation of the label is zero. " + "Model cannot be regularized.") @@ -407,8 +411,10 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model, Array(0D), objectiveHistory) - instr.logSuccess(model) + model.setSummary(Some(trainingSummary)) + instr.logSuccess(model) + model } @Since("1.4.0") From 7f00d3675e18b8c0ea485eeb73eb5f7e12635bc5 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 13 Jan 2017 11:18:24 +0800 Subject: [PATCH 19/20] log quantileProbabilities.size --- .../org/apache/spark/ml/regression/AFTSurvivalRegression.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 537f5ba3d4560..2f78dd30b3af7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -229,7 +229,8 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val instr = Instrumentation.create(this, dataset) instr.logParams(labelCol, featuresCol, censorCol, predictionCol, quantilesCol, - quantileProbabilities, fitIntercept, maxIter, tol, aggregationDepth) + fitIntercept, maxIter, tol, aggregationDepth) + instr.logNamedValue("quantileProbabilities.size", $(quantileProbabilities).length) instr.logNumFeatures(numFeatures) if (!$(fitIntercept) && (0 until numFeatures).exists { i => From c8188b03c49912ab2ee9f7dc0f5aae5a9ddc1a1c Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Fri, 13 Jan 2017 11:20:26 +0800 Subject: [PATCH 20/20] del docConcentration --- mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index ac45117529efa..03f4ac5b28e90 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -892,7 +892,7 @@ class LDA @Since("1.6.0") ( val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, topicDistributionCol, k, maxIter, subsamplingRate, checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, topicConcentration, - docConcentration, learningDecay, optimizer, learningOffset, seed) + learningDecay, optimizer, learningOffset, seed) val oldLDA = new OldLDA() .setK($(k))