diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 8a5101533aa57..5a0773374fc5e 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -226,7 +226,9 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
val accumulableHeaders: Seq[String] = Seq("Accumulable", "Value")
def accumulableRow(acc: AccumulableInfo): Elem =
' +
'Launch Time: ${UIUtils.formatDate(new Date(launchTime))}' +
'${
- if (!isRunning) {
+ if (!taskInfo.running) {
s"""
Finish Time: ${UIUtils.formatDate(new Date(finishTime))}"""
} else {
""
}
}' +
- '
Scheduler Delay: ${schedulerDelay} ms' +
+ '
Scheduler Delay: $schedulerDelay ms' +
'
Task Deserialization Time: ${UIUtils.formatDuration(deserializationTime)}' +
'
Shuffle Read Time: ${UIUtils.formatDuration(shuffleReadTime)}' +
'
Executor Computing Time: ${UIUtils.formatDuration(executorComputingTime)}' +
@@ -609,28 +609,28 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
'
Getting Result Time: ${UIUtils.formatDuration(gettingResultTime)}">' +
'
' +
' ' +
+ 'x="$schedulerDelayProportionPos%" y="0px" height="26px"' +
+ 'width="$schedulerDelayProportion%"">' +
' ' +
+ 'x="$deserializationTimeProportionPos%" y="0px" height="26px"' +
+ 'width="$deserializationTimeProportion%">' +
' ' +
+ 'x="$shuffleReadTimeProportionPos%" y="0px" height="26px"' +
+ 'width="$shuffleReadTimeProportion%">' +
' ' +
+ 'x="$executorRuntimeProportionPos%" y="0px" height="26px"' +
+ 'width="$executorComputingTimeProportion%">' +
' ' +
+ 'x="$shuffleWriteTimeProportionPos%" y="0px" height="26px"' +
+ 'width="$shuffleWriteTimeProportion%">' +
' ' +
+ 'x="$serializationTimeProportionPos%" y="0px" height="26px"' +
+ 'width="$serializationTimeProportion%">' +
' ',
- 'start': new Date(${launchTime}),
- 'end': new Date(${finishTime})
+ 'x="$gettingResultTimeProportionPos%" y="0px" height="26px"' +
+ 'width="$gettingResultTimeProportion%">',
+ 'start': new Date($launchTime),
+ 'end': new Date($finishTime)
}
"""
timelineObject
@@ -640,8 +640,8 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
case (executorId, host) =>
s"""
{
- 'id': '${executorId}',
- 'content': '${executorId} / ${host}',
+ 'id': '$executorId',
+ 'content': '$executorId / $host',
}
"""
}.mkString("[", ",", "]")
@@ -649,14 +649,20 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
val maxZoom = maxFinishTime - minLaunchTime
- Event Timeline {
- if (MAX_TIMELINE_TASKS <= numEffectiveTasks) {
- s"(Most recent ${MAX_TIMELINE_TASKS})"
- }
- }
-
+ Event Timeline
++
+ {
+ if (MAX_TIMELINE_TASKS < numEffectiveTasks) {
+
+ This stage has more than the maximum number of tasks that can be shown in the
+ visualization! Only the first {MAX_TIMELINE_TASKS} tasks
+ (of {numEffectiveTasks} total) are shown.
+
+ } else {
+ Seq.empty
+ }
+ }
@@ -667,7 +673,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
++
}
diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index 3dbb35f7054a2..af4f00054997c 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -118,14 +118,14 @@ if [[ ! "$@" =~ --skip-publish ]]; then
rm -rf $SPARK_REPO
- build/mvn -DskipTests -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
- -Pyarn -Phive -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
+ build/mvn -DskipTests -Pyarn -Phive \
+ -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
clean install
./dev/change-version-to-2.11.sh
- build/mvn -DskipTests -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
- -Dscala-2.11 -Pyarn -Phive -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
+ build/mvn -DskipTests -Pyarn -Phive \
+ -Dscala-2.11 -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
clean install
./dev/change-version-to-2.10.sh
@@ -228,9 +228,9 @@ if [[ ! "$@" =~ --skip-package ]]; then
# We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds
# share the same Zinc server.
- make_binary_release "hadoop1" "-Phive -Phive-thriftserver -Dhadoop.version=1.0.4" "3030" &
- make_binary_release "hadoop1-scala2.11" "-Phive -Dscala-2.11" "3031" &
- make_binary_release "cdh4" "-Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
+ make_binary_release "hadoop1" "-Phadoop-1 -Phive -Phive-thriftserver" "3030" &
+ make_binary_release "hadoop1-scala2.11" "-Phadoop-1 -Phive -Dscala-2.11" "3031" &
+ make_binary_release "cdh4" "-Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" &
make_binary_release "hadoop2.3" "-Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" &
make_binary_release "hadoop2.4" "-Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" &
make_binary_release "mapr3" "-Pmapr3 -Phive -Phive-thriftserver" "3035" &
diff --git a/dev/run-tests b/dev/run-tests
index ef587a1a5988c..44d802782c4a4 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -40,11 +40,11 @@ function handle_error () {
{
if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then
if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then
- export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4"
+ export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=1.0.4"
elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then
- export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1"
+ export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=2.0.0-mr1-cdh4.1.1"
elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then
- export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0"
+ export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.2"
elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then
export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
fi
diff --git a/dev/scalastyle b/dev/scalastyle
index 4e03f89ed5d5d..7f014c82f14c6 100755
--- a/dev/scalastyle
+++ b/dev/scalastyle
@@ -20,8 +20,8 @@
echo -e "q\n" | build/sbt -Phive -Phive-thriftserver scalastyle > scalastyle.txt
echo -e "q\n" | build/sbt -Phive -Phive-thriftserver test:scalastyle >> scalastyle.txt
# Check style with YARN built too
-echo -e "q\n" | build/sbt -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 scalastyle >> scalastyle.txt
-echo -e "q\n" | build/sbt -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 test:scalastyle >> scalastyle.txt
+echo -e "q\n" | build/sbt -Pyarn -Phadoop-2.2 scalastyle >> scalastyle.txt
+echo -e "q\n" | build/sbt -Pyarn -Phadoop-2.2 test:scalastyle >> scalastyle.txt
ERRORS=$(cat scalastyle.txt | awk '{if($1~/error/)print}')
rm scalastyle.txt
diff --git a/docs/building-spark.md b/docs/building-spark.md
index 287fcd3c4034f..6e310ff424784 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -59,14 +59,14 @@ You can fix this by setting the `MAVEN_OPTS` variable as discussed before.
# Specifying the Hadoop Version
-Because HDFS is not protocol-compatible across versions, if you want to read from HDFS, you'll need to build Spark against the specific HDFS version in your environment. You can do this through the "hadoop.version" property. If unset, Spark will build against Hadoop 1.0.4 by default. Note that certain build profiles are required for particular Hadoop versions:
+Because HDFS is not protocol-compatible across versions, if you want to read from HDFS, you'll need to build Spark against the specific HDFS version in your environment. You can do this through the "hadoop.version" property. If unset, Spark will build against Hadoop 2.2.0 by default. Note that certain build profiles are required for particular Hadoop versions:
Hadoop version Profile required
- 1.x to 2.1.x (none)
+ 1.x to 2.1.x hadoop-1
2.2.x hadoop-2.2
2.3.x hadoop-2.3
2.4.x hadoop-2.4
@@ -77,10 +77,10 @@ For Apache Hadoop versions 1.x, Cloudera CDH "mr1" distributions, and other Hado
{% highlight bash %}
# Apache Hadoop 1.2.1
-mvn -Dhadoop.version=1.2.1 -DskipTests clean package
+mvn -Dhadoop.version=1.2.1 -Phadoop-1 -DskipTests clean package
# Cloudera CDH 4.2.0 with MapReduce v1
-mvn -Dhadoop.version=2.0.0-mr1-cdh4.2.0 -DskipTests clean package
+mvn -Dhadoop.version=2.0.0-mr1-cdh4.2.0 -Phadoop-1 -DskipTests clean package
{% endhighlight %}
You can enable the "yarn" profile and optionally set the "yarn.version" property if it is different from "hadoop.version". Spark only supports YARN versions 2.2.0 and later.
@@ -88,8 +88,9 @@ You can enable the "yarn" profile and optionally set the "yarn.version" property
Examples:
{% highlight bash %}
+
# Apache Hadoop 2.2.X
-mvn -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 -DskipTests clean package
+mvn -Pyarn -Phadoop-2.2 -DskipTests clean package
# Apache Hadoop 2.3.X
mvn -Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0 -DskipTests clean package
diff --git a/docs/hadoop-third-party-distributions.md b/docs/hadoop-third-party-distributions.md
index 96bd69ca3b33b..795dd82a6be06 100644
--- a/docs/hadoop-third-party-distributions.md
+++ b/docs/hadoop-third-party-distributions.md
@@ -14,7 +14,7 @@ property. For certain versions, you will need to specify additional profiles. Fo
see the guide on [building with maven](building-spark.html#specifying-the-hadoop-version):
mvn -Dhadoop.version=1.0.4 -DskipTests clean package
- mvn -Phadoop-2.2 -Dhadoop.version=2.2.0 -DskipTests clean package
+ mvn -Phadoop-2.3 -Dhadoop.version=2.3.0 -DskipTests clean package
The table below lists the corresponding `hadoop.version` code for each CDH/HDP release. Note that
some Hadoop releases are binary compatible across client versions. This means the pre-built Spark
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
index eac4f898a475d..ec533d174ebdc 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
@@ -28,6 +28,7 @@
import org.apache.spark.ml.classification.ClassificationModel;
import org.apache.spark.ml.param.IntParam;
import org.apache.spark.ml.param.ParamMap;
+import org.apache.spark.ml.util.Identifiable$;
import org.apache.spark.mllib.linalg.BLAS;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
@@ -103,7 +104,23 @@ public static void main(String[] args) throws Exception {
* However, this should still compile and run successfully.
*/
class MyJavaLogisticRegression
- extends Classifier {
+ extends Classifier {
+
+ public MyJavaLogisticRegression() {
+ init();
+ }
+
+ public MyJavaLogisticRegression(String uid) {
+ this.uid_ = uid;
+ init();
+ }
+
+ private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg");
+
+ @Override
+ public String uid() {
+ return uid_;
+ }
/**
* Param for max number of iterations
@@ -117,7 +134,7 @@ class MyJavaLogisticRegression
int getMaxIter() { return (Integer) getOrDefault(maxIter); }
- public MyJavaLogisticRegression() {
+ private void init() {
setMaxIter(100);
}
@@ -137,7 +154,7 @@ public MyJavaLogisticRegressionModel train(DataFrame dataset) {
Vector weights = Vectors.zeros(numFeatures); // Learning would happen here.
// Create a model, and return it.
- return new MyJavaLogisticRegressionModel(this, weights);
+ return new MyJavaLogisticRegressionModel(uid(), weights).setParent(this);
}
}
@@ -149,17 +166,21 @@ public MyJavaLogisticRegressionModel train(DataFrame dataset) {
* However, this should still compile and run successfully.
*/
class MyJavaLogisticRegressionModel
- extends ClassificationModel {
-
- private MyJavaLogisticRegression parent_;
- public MyJavaLogisticRegression parent() { return parent_; }
+ extends ClassificationModel {
private Vector weights_;
public Vector weights() { return weights_; }
- public MyJavaLogisticRegressionModel(MyJavaLogisticRegression parent_, Vector weights_) {
- this.parent_ = parent_;
- this.weights_ = weights_;
+ public MyJavaLogisticRegressionModel(String uid, Vector weights) {
+ this.uid_ = uid;
+ this.weights_ = weights;
+ }
+
+ private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg");
+
+ @Override
+ public String uid() {
+ return uid_;
}
// This uses the default implementation of transform(), which reads column "features" and outputs
@@ -204,6 +225,6 @@ public Vector predictRaw(Vector features) {
*/
@Override
public MyJavaLogisticRegressionModel copy(ParamMap extra) {
- return copyValues(new MyJavaLogisticRegressionModel(parent_, weights_), extra);
+ return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra);
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
index ef1ec103a879f..54738813d0016 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -66,7 +66,7 @@ public static void main(String[] args) {
.setOutputCol("features");
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
- .setRegParam(0.01);
+ .setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
@@ -77,7 +77,7 @@ public static void main(String[] args) {
List<Document> localTest = Lists.newArrayList(
new Document(4L, "spark i j k"),
new Document(5L, "l m n"),
- new Document(6L, "mapreduce spark"),
+ new Document(6L, "spark hadoop spark"),
new Document(7L, "apache hadoop"));
DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class);
diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py
index fab21f003b233..b4f06bf888746 100644
--- a/examples/src/main/python/ml/simple_text_classification_pipeline.py
+++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py
@@ -48,7 +48,7 @@
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
- lr = LogisticRegression(maxIter=10, regParam=0.01)
+ lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# Fit the pipeline to training documents.
@@ -58,7 +58,7 @@
Document = Row("id", "text")
test = sc.parallelize([(4, "spark i j k"),
(5, "l m n"),
- (6, "mapreduce spark"),
+ (6, "spark hadoop spark"),
(7, "apache hadoop")]) \
.map(lambda x: Document(*x)).toDF()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
index 2a2d0677272a0..3ee456edbe01e 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.{ClassificationModel, Classifier, ClassifierParams}
import org.apache.spark.ml.param.{IntParam, ParamMap}
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
@@ -106,10 +107,12 @@ private trait MyLogisticRegressionParams extends ClassifierParams {
*
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
*/
-private class MyLogisticRegression
+private class MyLogisticRegression(override val uid: String)
extends Classifier[Vector, MyLogisticRegression, MyLogisticRegressionModel]
with MyLogisticRegressionParams {
+ def this() = this(Identifiable.randomUID("myLogReg"))
+
setMaxIter(100) // Initialize
// The parameter setter is in this class since it should return type MyLogisticRegression.
@@ -125,7 +128,7 @@ private class MyLogisticRegression
val weights = Vectors.zeros(numFeatures) // Learning would happen here.
// Create a model, and return it.
- new MyLogisticRegressionModel(this, weights)
+ new MyLogisticRegressionModel(uid, weights).setParent(this)
}
}
@@ -135,7 +138,7 @@ private class MyLogisticRegression
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
*/
private class MyLogisticRegressionModel(
- override val parent: MyLogisticRegression,
+ override val uid: String,
val weights: Vector)
extends ClassificationModel[Vector, MyLogisticRegressionModel]
with MyLogisticRegressionParams {
@@ -173,6 +176,6 @@ private class MyLogisticRegressionModel(
* This is used for the default implementation of [[transform()]].
*/
override def copy(extra: ParamMap): MyLogisticRegressionModel = {
- copyValues(new MyLogisticRegressionModel(parent, weights), extra)
+ copyValues(new MyLogisticRegressionModel(uid, weights), extra)
}
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
index 6772efd2c581c..1324b066c30c3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -64,7 +64,7 @@ object SimpleTextClassificationPipeline {
.setOutputCol("features")
val lr = new LogisticRegression()
.setMaxIter(10)
- .setRegParam(0.01)
+ .setRegParam(0.001)
val pipeline = new Pipeline()
.setStages(Array(tokenizer, hashingTF, lr))
@@ -75,7 +75,7 @@ object SimpleTextClassificationPipeline {
val test = sc.parallelize(Seq(
Document(4L, "spark i j k"),
Document(5L, "l m n"),
- Document(6L, "mapreduce spark"),
+ Document(6L, "spark hadoop spark"),
Document(7L, "apache hadoop")))
// Make predictions on test documents.
diff --git a/make-distribution.sh b/make-distribution.sh
index 1bfa9acb1fe6e..8d6e91d67593f 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -58,7 +58,7 @@ while (( "$#" )); do
--hadoop)
echo "Error: '--hadoop' is no longer supported:"
echo "Error: use Maven profiles and options -Dhadoop.version and -Dyarn.version instead."
- echo "Error: Related profiles include hadoop-2.2, hadoop-2.3 and hadoop-2.4."
+ echo "Error: Related profiles include hadoop-1, hadoop-2.2, hadoop-2.3 and hadoop-2.4."
exit_with_usage
;;
--with-yarn)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
index 9974efe7b1d25..7fd515369b19b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Model.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
@@ -32,7 +32,15 @@ abstract class Model[M <: Model[M]] extends Transformer {
* The parent estimator that produced this model.
* Note: For ensembles' component Models, this value can be null.
*/
- val parent: Estimator[M]
+ var parent: Estimator[M] = _
+
+ /**
+ * Sets the parent of this model (Java API).
+ */
+ def setParent(parent: Estimator[M]): M = {
+ this.parent = parent
+ this.asInstanceOf[M]
+ }
override def copy(extra: ParamMap): M = {
// The default implementation of Params.copy doesn't work for models.
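The change above replaces the abstract `val parent` with a mutable field plus a `setParent` method, so a fitted model is now identified by its `uid` and has the producing estimator attached after training. A minimal usage sketch of that convention (not part of this patch; assumes a spark-shell style environment with `sc` and `sqlContext` in scope, and a made-up two-row training set):

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

import sqlContext.implicits._

// Tiny illustrative training set with the standard "label"/"features" columns.
val training = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
  LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)))).toDF()

val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.001)
val model = lr.fit(training)

println(model.uid)          // auto-generated, e.g. "logreg_..." via Identifiable.randomUID
assert(model.parent eq lr)  // attached via setParent(this) inside Predictor.fit()
```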
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index 33d430f5671ee..fac54188f9f4e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable.ListBuffer
import org.apache.spark.Logging
import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
import org.apache.spark.ml.param.{Param, ParamMap, Params}
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
@@ -80,7 +81,9 @@ abstract class PipelineStage extends Params with Logging {
* an identity transformer.
*/
@AlphaComponent
-class Pipeline extends Estimator[PipelineModel] {
+class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
+
+ def this() = this(Identifiable.randomUID("pipeline"))
/**
* param for pipeline stages
@@ -148,7 +151,7 @@ class Pipeline extends Estimator[PipelineModel] {
}
}
- new PipelineModel(this, transformers.toArray)
+ new PipelineModel(uid, transformers.toArray).setParent(this)
}
override def copy(extra: ParamMap): Pipeline = {
@@ -171,7 +174,7 @@ class Pipeline extends Estimator[PipelineModel] {
*/
@AlphaComponent
class PipelineModel private[ml] (
- override val parent: Pipeline,
+ override val uid: String,
val stages: Array[Transformer])
extends Model[PipelineModel] with Logging {
@@ -190,6 +193,6 @@ class PipelineModel private[ml] (
}
override def copy(extra: ParamMap): PipelineModel = {
- new PipelineModel(parent, stages)
+ new PipelineModel(uid, stages)
}
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
index f6a5f27425d1f..ec0f76aa668bd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -88,7 +88,7 @@ abstract class Predictor[
// This handles a few items such as schema validation.
// Developers only need to implement train().
transformSchema(dataset.schema, logging = true)
- copyValues(train(dataset))
+ copyValues(train(dataset).setParent(this))
}
override def copy(extra: ParamMap): Learner = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index dcebea1d4b015..7c961332bf5b6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tree.{TreeClassifierParams, DecisionTreeParams, DecisionTreeModel, Node}
-import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree}
@@ -39,10 +39,12 @@ import org.apache.spark.sql.DataFrame
* features.
*/
@AlphaComponent
-final class DecisionTreeClassifier
+final class DecisionTreeClassifier(override val uid: String)
extends Predictor[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel]
with DecisionTreeParams with TreeClassifierParams {
+ def this() = this(Identifiable.randomUID("dtc"))
+
// Override parameter setters from parent trait for Java API compatibility.
override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value)
@@ -101,7 +103,7 @@ object DecisionTreeClassifier {
*/
@AlphaComponent
final class DecisionTreeClassificationModel private[ml] (
- override val parent: DecisionTreeClassifier,
+ override val uid: String,
override val rootNode: Node)
extends PredictionModel[Vector, DecisionTreeClassificationModel]
with DecisionTreeModel with Serializable {
@@ -114,7 +116,7 @@ final class DecisionTreeClassificationModel private[ml] (
}
override def copy(extra: ParamMap): DecisionTreeClassificationModel = {
- copyValues(new DecisionTreeClassificationModel(parent, rootNode), extra)
+ copyValues(new DecisionTreeClassificationModel(uid, rootNode), extra)
}
override def toString: String = {
@@ -138,6 +140,7 @@ private[ml] object DecisionTreeClassificationModel {
s"Cannot convert non-classification DecisionTreeModel (old API) to" +
s" DecisionTreeClassificationModel (new API). Algo is: ${oldModel.algo}")
val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures)
- new DecisionTreeClassificationModel(parent, rootNode)
+ val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtc")
+ new DecisionTreeClassificationModel(uid, rootNode)
}
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index ae51b05a0c42d..d504d84beb91e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -25,7 +25,7 @@ import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.tree.{GBTParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel}
-import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTrees => OldGBT}
@@ -44,10 +44,12 @@ import org.apache.spark.sql.DataFrame
* Note: Multiclass labels are not currently supported.
*/
@AlphaComponent
-final class GBTClassifier
+final class GBTClassifier(override val uid: String)
extends Predictor[Vector, GBTClassifier, GBTClassificationModel]
with GBTParams with TreeClassifierParams with Logging {
+ def this() = this(Identifiable.randomUID("gbtc"))
+
// Override parameter setters from parent trait for Java API compatibility.
// Parameters from TreeClassifierParams:
@@ -160,7 +162,7 @@ object GBTClassifier {
*/
@AlphaComponent
final class GBTClassificationModel(
- override val parent: GBTClassifier,
+ override val uid: String,
private val _trees: Array[DecisionTreeRegressionModel],
private val _treeWeights: Array[Double])
extends PredictionModel[Vector, GBTClassificationModel]
@@ -184,7 +186,7 @@ final class GBTClassificationModel(
}
override def copy(extra: ParamMap): GBTClassificationModel = {
- copyValues(new GBTClassificationModel(parent, _trees, _treeWeights), extra)
+ copyValues(new GBTClassificationModel(uid, _trees, _treeWeights), extra)
}
override def toString: String = {
@@ -210,6 +212,7 @@ private[ml] object GBTClassificationModel {
// parent, fittingParamMap for each tree is null since there are no good ways to set these.
DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures)
}
- new GBTClassificationModel(parent, newTrees, oldModel.treeWeights)
+ val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtc")
+ new GBTClassificationModel(uid, newTrees, oldModel.treeWeights)
}
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 2b103626873a9..8694c96e4c5b6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -26,6 +26,7 @@ import breeze.optimize.{CachedDiffFunction, DiffFunction}
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.BLAS._
import org.apache.spark.mllib.regression.LabeledPoint
@@ -50,10 +51,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
* Currently, this class only supports binary classification.
*/
@AlphaComponent
-class LogisticRegression
+class LogisticRegression(override val uid: String)
extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel]
with LogisticRegressionParams with Logging {
+ def this() = this(Identifiable.randomUID("logreg"))
+
/**
* Set the regularization parameter.
* Default is 0.0.
@@ -213,7 +216,7 @@ class LogisticRegression
(weightsWithIntercept, 0.0)
}
- new LogisticRegressionModel(this, weights.compressed, intercept)
+ new LogisticRegressionModel(uid, weights.compressed, intercept)
}
}
@@ -224,7 +227,7 @@ class LogisticRegression
*/
@AlphaComponent
class LogisticRegressionModel private[ml] (
- override val parent: LogisticRegression,
+ override val uid: String,
val weights: Vector,
val intercept: Double)
extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
@@ -276,7 +279,7 @@ class LogisticRegressionModel private[ml] (
}
override def copy(extra: ParamMap): LogisticRegressionModel = {
- copyValues(new LogisticRegressionModel(parent, weights, intercept), extra)
+ copyValues(new LogisticRegressionModel(uid, weights, intercept), extra)
}
override protected def raw2prediction(rawPrediction: Vector): Double = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index afb8d75d57384..1543f051ccd17 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -25,7 +25,7 @@ import org.apache.spark.annotation.{AlphaComponent, Experimental}
import org.apache.spark.ml._
import org.apache.spark.ml.attribute._
import org.apache.spark.ml.param.Param
-import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
@@ -40,19 +40,17 @@ private[ml] trait OneVsRestParams extends PredictorParams {
type ClassifierType = Classifier[F, E, M] forSome {
type F
type M <: ClassificationModel[F, M]
- type E <: Classifier[F, E, M]
+ type E <: Classifier[F, E, M]
}
/**
* param for the base binary classifier that we reduce multiclass classification into.
* @group param
*/
- val classifier: Param[ClassifierType] =
- new Param(this, "classifier", "base binary classifier ")
+ val classifier: Param[ClassifierType] = new Param(this, "classifier", "base binary classifier")
/** @group getParam */
def getClassifier: ClassifierType = $(classifier)
-
}
/**
@@ -70,10 +68,10 @@ private[ml] trait OneVsRestParams extends PredictorParams {
* (taking label 0).
*/
@AlphaComponent
-class OneVsRestModel private[ml] (
- override val parent: OneVsRest,
- labelMetadata: Metadata,
- val models: Array[_ <: ClassificationModel[_,_]])
+final class OneVsRestModel private[ml] (
+ override val uid: String,
+ labelMetadata: Metadata,
+ val models: Array[_ <: ClassificationModel[_,_]])
extends Model[OneVsRestModel] with OneVsRestParams {
override def transformSchema(schema: StructType): StructType = {
@@ -145,11 +143,13 @@ class OneVsRestModel private[ml] (
* is picked to label the example.
*/
@Experimental
-final class OneVsRest extends Estimator[OneVsRestModel] with OneVsRestParams {
+final class OneVsRest(override val uid: String)
+ extends Estimator[OneVsRestModel] with OneVsRestParams {
+
+ def this() = this(Identifiable.randomUID("oneVsRest"))
/** @group setParam */
- def setClassifier(value: Classifier[_,_,_]): this.type = {
- // TODO: Find a better way to do this. Existential Types don't work with Java API so cast needed
+ def setClassifier(value: Classifier[_, _, _]): this.type = {
set(classifier, value.asInstanceOf[ClassifierType])
}
@@ -204,6 +204,7 @@ final class OneVsRest extends Estimator[OneVsRestModel] with OneVsRestParams {
NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses)
case attr: Attribute => attr
}
- copyValues(new OneVsRestModel(this, labelAttribute.toMetadata(), models))
+ val model = new OneVsRestModel(uid, labelAttribute.toMetadata(), models).setParent(this)
+ copyValues(model)
}
}
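With `OneVsRest` and `OneVsRestModel` reworked the same way, the base classifier is still supplied through the `classifier` param and the fitted model now carries the estimator's `uid`. A brief sketch of the intended call pattern (not from the patch; `multiclassTraining` is a hypothetical DataFrame with "label" and "features" columns whose label takes more than two values):

```scala
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}

val ovr = new OneVsRest()
  .setClassifier(new LogisticRegression().setMaxIter(10).setRegParam(0.001))

val ovrModel = ovr.fit(multiclassTraining)  // trains one binary model per class
println(ovrModel.uid)                       // same uid as `ovr`; parent set via setParent(this)
```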
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 9954893f14359..a1de7919859eb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -23,7 +23,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tree.{RandomForestParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel}
-import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{RandomForest => OldRandomForest}
@@ -41,10 +41,12 @@ import org.apache.spark.sql.DataFrame
* features.
*/
@AlphaComponent
-final class RandomForestClassifier
+final class RandomForestClassifier(override val uid: String)
extends Predictor[Vector, RandomForestClassifier, RandomForestClassificationModel]
with RandomForestParams with TreeClassifierParams {
+ def this() = this(Identifiable.randomUID("rfc"))
+
// Override parameter setters from parent trait for Java API compatibility.
// Parameters from TreeClassifierParams:
@@ -118,7 +120,7 @@ object RandomForestClassifier {
*/
@AlphaComponent
final class RandomForestClassificationModel private[ml] (
- override val parent: RandomForestClassifier,
+ override val uid: String,
private val _trees: Array[DecisionTreeClassificationModel])
extends PredictionModel[Vector, RandomForestClassificationModel]
with TreeEnsembleModel with Serializable {
@@ -146,7 +148,7 @@ final class RandomForestClassificationModel private[ml] (
}
override def copy(extra: ParamMap): RandomForestClassificationModel = {
- copyValues(new RandomForestClassificationModel(parent, _trees), extra)
+ copyValues(new RandomForestClassificationModel(uid, _trees), extra)
}
override def toString: String = {
@@ -172,6 +174,7 @@ private[ml] object RandomForestClassificationModel {
// parent, fittingParamMap for each tree is null since there are no good ways to set these.
DecisionTreeClassificationModel.fromOld(tree, null, categoricalFeatures)
}
- new RandomForestClassificationModel(parent, newTrees)
+ val uid = if (parent != null) parent.uid else Identifiable.randomUID("rfc")
+ new RandomForestClassificationModel(uid, newTrees)
}
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index e5a73c6087a11..c1af09c9694ba 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
@@ -33,7 +33,10 @@ import org.apache.spark.sql.types.DoubleType
* Evaluator for binary classification, which expects two input columns: score and label.
*/
@AlphaComponent
-class BinaryClassificationEvaluator extends Evaluator with HasRawPredictionCol with HasLabelCol {
+class BinaryClassificationEvaluator(override val uid: String)
+ extends Evaluator with HasRawPredictionCol with HasLabelCol {
+
+ def this() = this(Identifiable.randomUID("binEval"))
/**
* param for metric name in evaluation
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
index 6eb1db6971111..62f4a6343423e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}
@@ -32,7 +32,10 @@ import org.apache.spark.sql.types.{DoubleType, StructType}
* Binarize a column of continuous features given a threshold.
*/
@AlphaComponent
-final class Binarizer extends Transformer with HasInputCol with HasOutputCol {
+final class Binarizer(override val uid: String)
+ extends Transformer with HasInputCol with HasOutputCol {
+
+ def this() = this(Identifiable.randomUID("binarizer"))
/**
* Param for threshold used to binarize continuous features.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index d8f1961cb380a..ac8dfb5632a7b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -21,11 +21,11 @@ import java.{util => ju}
import org.apache.spark.SparkException
import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.Model
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
-import org.apache.spark.ml.util.SchemaUtils
-import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
@@ -35,10 +35,10 @@ import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
* `Bucketizer` maps a column of continuous features to a column of feature buckets.
*/
@AlphaComponent
-final class Bucketizer private[ml] (override val parent: Estimator[Bucketizer])
+final class Bucketizer(override val uid: String)
extends Model[Bucketizer] with HasInputCol with HasOutputCol {
- def this() = this(null)
+ def this() = this(Identifiable.randomUID("bucketizer"))
/**
* Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
index f8b56293e3ccc..8b32eee0e490a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -20,6 +20,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.types.DataType
@@ -31,7 +32,10 @@ import org.apache.spark.sql.types.DataType
* multiplier.
*/
@AlphaComponent
-class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {
+class ElementwiseProduct(override val uid: String)
+ extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {
+
+ def this() = this(Identifiable.randomUID("elemProd"))
/**
* the vector to multiply with input vectors
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index c305a819a8966..30033ced68a04 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -20,6 +20,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{IntParam, ParamValidators}
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.types.DataType
@@ -29,7 +30,9 @@ import org.apache.spark.sql.types.DataType
* Maps a sequence of terms to their term frequencies using the hashing trick.
*/
@AlphaComponent
-class HashingTF extends UnaryTransformer[Iterable[_], Vector, HashingTF] {
+class HashingTF(override val uid: String) extends UnaryTransformer[Iterable[_], Vector, HashingTF] {
+
+ def this() = this(Identifiable.randomUID("hashingTF"))
/**
* Number of features. Should be > 0.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
index d901a20aed002..788c392050c2d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
@@ -62,7 +62,9 @@ private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol
* Compute the Inverse Document Frequency (IDF) given a collection of documents.
*/
@AlphaComponent
-final class IDF extends Estimator[IDFModel] with IDFBase {
+final class IDF(override val uid: String) extends Estimator[IDFModel] with IDFBase {
+
+ def this() = this(Identifiable.randomUID("idf"))
/** @group setParam */
def setInputCol(value: String): this.type = set(inputCol, value)
@@ -74,7 +76,7 @@ final class IDF extends Estimator[IDFModel] with IDFBase {
transformSchema(dataset.schema, logging = true)
val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
val idf = new feature.IDF($(minDocFreq)).fit(input)
- copyValues(new IDFModel(this, idf))
+ copyValues(new IDFModel(uid, idf).setParent(this))
}
override def transformSchema(schema: StructType): StructType = {
@@ -88,7 +90,7 @@ final class IDF extends Estimator[IDFModel] with IDFBase {
*/
@AlphaComponent
class IDFModel private[ml] (
- override val parent: IDF,
+ override val uid: String,
idfModel: feature.IDFModel)
extends Model[IDFModel] with IDFBase {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
index 755b46a64c7f1..3f689d1585cd6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala
@@ -20,6 +20,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.types.DataType
@@ -29,7 +30,9 @@ import org.apache.spark.sql.types.DataType
* Normalize a vector to have unit norm using the given p-norm.
*/
@AlphaComponent
-class Normalizer extends UnaryTransformer[Vector, Vector, Normalizer] {
+class Normalizer(override val uid: String) extends UnaryTransformer[Vector, Vector, Normalizer] {
+
+ def this() = this(Identifiable.randomUID("normalizer"))
/**
* Normalization in L^p^ space. Must be >= 1.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
index 46514ae5f0e84..1fb9b9ae75091 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -24,7 +24,7 @@ import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribu
import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
/**
@@ -37,8 +37,10 @@ import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
* linearly dependent because they sum up to one.
*/
@AlphaComponent
-class OneHotEncoder extends UnaryTransformer[Double, Vector, OneHotEncoder]
- with HasInputCol with HasOutputCol {
+class OneHotEncoder(override val uid: String)
+ extends UnaryTransformer[Double, Vector, OneHotEncoder] with HasInputCol with HasOutputCol {
+
+ def this() = this(Identifiable.randomUID("oneHot"))
/**
* Whether to include a component in the encoded vectors for the first category, defaults to true.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 9e6177ca27e4a..41564410e4965 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{IntParam, ParamValidators}
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.types.DataType
@@ -34,7 +35,10 @@ import org.apache.spark.sql.types.DataType
* `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
*/
@AlphaComponent
-class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] {
+class PolynomialExpansion(override val uid: String)
+ extends UnaryTransformer[Vector, Vector, PolynomialExpansion] {
+
+ def this() = this(Identifiable.randomUID("poly"))
/**
* The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index 7cad59ff3fa37..5ccda15d872ed 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -21,6 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
@@ -55,7 +56,10 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with
* statistics on the samples in the training set.
*/
@AlphaComponent
-class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerParams {
+class StandardScaler(override val uid: String) extends Estimator[StandardScalerModel]
+ with StandardScalerParams {
+
+ def this() = this(Identifiable.randomUID("stdScal"))
setDefault(withMean -> false, withStd -> true)
@@ -76,7 +80,7 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP
val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd))
val scalerModel = scaler.fit(input)
- copyValues(new StandardScalerModel(this, scalerModel))
+ copyValues(new StandardScalerModel(uid, scalerModel).setParent(this))
}
override def transformSchema(schema: StructType): StructType = {
@@ -96,7 +100,7 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP
*/
@AlphaComponent
class StandardScalerModel private[ml] (
- override val parent: StandardScaler,
+ override val uid: String,
scaler: feature.StandardScalerModel)
extends Model[StandardScalerModel] with StandardScalerParams {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index 3d78537ad84cb..3f79b67309f07 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -23,6 +23,7 @@ import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{NumericType, StringType, StructType}
@@ -58,7 +59,10 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
* So the most frequent label gets index 0.
*/
@AlphaComponent
-class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerBase {
+class StringIndexer(override val uid: String) extends Estimator[StringIndexerModel]
+ with StringIndexerBase {
+
+ def this() = this(Identifiable.randomUID("strIdx"))
/** @group setParam */
def setInputCol(value: String): this.type = set(inputCol, value)
@@ -73,7 +77,7 @@ class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerBase
.map(_.getString(0))
.countByValue()
val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray
- copyValues(new StringIndexerModel(this, labels))
+ copyValues(new StringIndexerModel(uid, labels).setParent(this))
}
override def transformSchema(schema: StructType): StructType = {
@@ -87,7 +91,7 @@ class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerBase
*/
@AlphaComponent
class StringIndexerModel private[ml] (
- override val parent: StringIndexer,
+ override val uid: String,
labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase {
private val labelToIndex: OpenHashMap[String, Double] = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 649c217b16590..36d9e17eca41b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -20,6 +20,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
/**
@@ -27,7 +28,9 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
* A tokenizer that converts the input string to lowercase and then splits it by white spaces.
*/
@AlphaComponent
-class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] {
+class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], Tokenizer] {
+
+ def this() = this(Identifiable.randomUID("tok"))
override protected def createTransformFunc: String => Seq[String] = {
_.toLowerCase.split("\\s")
@@ -48,7 +51,10 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] {
* It returns an array of strings that can be empty.
*/
@AlphaComponent
-class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] {
+class RegexTokenizer(override val uid: String)
+ extends UnaryTransformer[String, Seq[String], RegexTokenizer] {
+
+ def this() = this(Identifiable.randomUID("regexTok"))
/**
* Minimum token length, >= 0.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 796758a70ef18..1c0009476908c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -23,6 +23,7 @@ import org.apache.spark.SparkException
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
@@ -33,7 +34,10 @@ import org.apache.spark.sql.types._
* A feature transformer that merges multiple columns into a vector column.
*/
@AlphaComponent
-class VectorAssembler extends Transformer with HasInputCols with HasOutputCol {
+class VectorAssembler(override val uid: String)
+ extends Transformer with HasInputCols with HasOutputCol {
+
+ def this() = this(Identifiable.randomUID("va"))
/** @group setParam */
def setInputCols(value: Array[String]): this.type = set(inputCols, value)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index 0f83a29c86bf6..6d1d0524e59ee 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.attribute._
import org.apache.spark.ml.param.{IntParam, ParamValidators, Params}
import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.callUDF
@@ -87,7 +87,10 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
* - Add option for allowing unknown categories.
*/
@AlphaComponent
-class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerParams {
+class VectorIndexer(override val uid: String) extends Estimator[VectorIndexerModel]
+ with VectorIndexerParams {
+
+ def this() = this(Identifiable.randomUID("vecIdx"))
/** @group setParam */
def setMaxCategories(value: Int): this.type = set(maxCategories, value)
@@ -110,7 +113,9 @@ class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerPara
iter.foreach(localCatStats.addVector)
Iterator(localCatStats)
}.reduce((stats1, stats2) => stats1.merge(stats2))
- copyValues(new VectorIndexerModel(this, numFeatures, categoryStats.getCategoryMaps))
+ val model = new VectorIndexerModel(uid, numFeatures, categoryStats.getCategoryMaps)
+ .setParent(this)
+ copyValues(model)
}
override def transformSchema(schema: StructType): StructType = {
@@ -238,7 +243,7 @@ private object VectorIndexer {
*/
@AlphaComponent
class VectorIndexerModel private[ml] (
- override val parent: VectorIndexer,
+ override val uid: String,
val numFeatures: Int,
val categoryMaps: Map[Int, Map[Double, Int]])
extends Model[VectorIndexerModel] with VectorIndexerParams {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 34ff92970129f..8ace8c53bb663 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.mllib.linalg.BLAS._
@@ -85,7 +85,9 @@ private[feature] trait Word2VecBase extends Params
* natural language processing or machine learning process.
*/
@AlphaComponent
-final class Word2Vec extends Estimator[Word2VecModel] with Word2VecBase {
+final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel] with Word2VecBase {
+
+ def this() = this(Identifiable.randomUID("w2v"))
/** @group setParam */
def setInputCol(value: String): this.type = set(inputCol, value)
@@ -122,7 +124,7 @@ final class Word2Vec extends Estimator[Word2VecModel] with Word2VecBase {
.setSeed($(seed))
.setVectorSize($(vectorSize))
.fit(input)
- copyValues(new Word2VecModel(this, wordVectors))
+ copyValues(new Word2VecModel(uid, wordVectors).setParent(this))
}
override def transformSchema(schema: StructType): StructType = {
@@ -136,7 +138,7 @@ final class Word2Vec extends Estimator[Word2VecModel] with Word2VecBase {
*/
@AlphaComponent
class Word2VecModel private[ml] (
- override val parent: Word2Vec,
+ override val uid: String,
wordVectors: feature.Word2VecModel)
extends Model[Word2VecModel] with Word2VecBase {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 5a7ec29aac6cc..247e08be1bb15 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -40,12 +40,17 @@ import org.apache.spark.ml.util.Identifiable
* @tparam T param value type
*/
@AlphaComponent
-class Param[T] (val parent: Params, val name: String, val doc: String, val isValid: T => Boolean)
+class Param[T](val parent: String, val name: String, val doc: String, val isValid: T => Boolean)
extends Serializable {
- def this(parent: Params, name: String, doc: String) =
+ def this(parent: Identifiable, name: String, doc: String, isValid: T => Boolean) =
+ this(parent.uid, name, doc, isValid)
+
+ def this(parent: String, name: String, doc: String) =
this(parent, name, doc, ParamValidators.alwaysTrue[T])
+ def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
+
/**
* Assert that the given value is valid for this parameter.
*
@@ -60,8 +65,7 @@ class Param[T] (val parent: Params, val name: String, val doc: String, val isVal
*/
private[param] def validate(value: T): Unit = {
if (!isValid(value)) {
- throw new IllegalArgumentException(s"$parent parameter $name given invalid value $value." +
- s" Parameter description: $toString")
+ throw new IllegalArgumentException(s"$parent parameter $name given invalid value $value.")
}
}
@@ -75,19 +79,15 @@ class Param[T] (val parent: Params, val name: String, val doc: String, val isVal
*/
def ->(value: T): ParamPair[T] = ParamPair(this, value)
- /**
- * Converts this param's name, doc, and optionally its default value and the user-supplied
- * value in its parent to string.
- */
- override def toString: String = {
- val valueStr = if (parent.isDefined(this)) {
- val defaultValueStr = parent.getDefault(this).map("default: " + _)
- val currentValueStr = parent.get(this).map("current: " + _)
- (defaultValueStr ++ currentValueStr).mkString("(", ", ", ")")
- } else {
- "(undefined)"
+ override final def toString: String = s"${parent}__$name"
+
+ override final def hashCode: Int = toString.##
+
+ override final def equals(obj: Any): Boolean = {
+ obj match {
+ case p: Param[_] => (p.parent == parent) && (p.name == name)
+ case _ => false
}
- s"$name: $doc $valueStr"
}
}
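
With parent now a plain uid string, a Param is identified by its (parent, name) pair rather than by object identity. A small sketch of the new semantics (the uid literal is illustrative):

import org.apache.spark.ml.param.IntParam

val p1 = new IntParam("lr_1234abcd5678", "maxIter", "max number of iterations (>= 0)")
val p2 = new IntParam("lr_1234abcd5678", "maxIter", "max number of iterations (>= 0)")
assert(p1 == p2 && p1.hashCode == p2.hashCode)     // equal by (parent uid, name)
assert(p1.toString == "lr_1234abcd5678__maxIter")  // stable "<uid>__<name>" key
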
@@ -173,49 +173,71 @@ object ParamValidators {
// specialize primitive-typed params because Java doesn't recognize scala.Double, scala.Int, ...
/** Specialized version of [[Param[Double]]] for Java. */
-class DoubleParam(parent: Params, name: String, doc: String, isValid: Double => Boolean)
+class DoubleParam(parent: String, name: String, doc: String, isValid: Double => Boolean)
extends Param[Double](parent, name, doc, isValid) {
- def this(parent: Params, name: String, doc: String) =
+ def this(parent: String, name: String, doc: String) =
this(parent, name, doc, ParamValidators.alwaysTrue)
+ def this(parent: Identifiable, name: String, doc: String, isValid: Double => Boolean) =
+ this(parent.uid, name, doc, isValid)
+
+ def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
+
override def w(value: Double): ParamPair[Double] = super.w(value)
}
/** Specialized version of [[Param[Int]]] for Java. */
-class IntParam(parent: Params, name: String, doc: String, isValid: Int => Boolean)
+class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolean)
extends Param[Int](parent, name, doc, isValid) {
- def this(parent: Params, name: String, doc: String) =
+ def this(parent: String, name: String, doc: String) =
this(parent, name, doc, ParamValidators.alwaysTrue)
+ def this(parent: Identifiable, name: String, doc: String, isValid: Int => Boolean) =
+ this(parent.uid, name, doc, isValid)
+
+ def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
+
override def w(value: Int): ParamPair[Int] = super.w(value)
}
/** Specialized version of [[Param[Float]]] for Java. */
-class FloatParam(parent: Params, name: String, doc: String, isValid: Float => Boolean)
+class FloatParam(parent: String, name: String, doc: String, isValid: Float => Boolean)
extends Param[Float](parent, name, doc, isValid) {
- def this(parent: Params, name: String, doc: String) =
+ def this(parent: String, name: String, doc: String) =
this(parent, name, doc, ParamValidators.alwaysTrue)
+ def this(parent: Identifiable, name: String, doc: String, isValid: Float => Boolean) =
+ this(parent.uid, name, doc, isValid)
+
+ def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
+
override def w(value: Float): ParamPair[Float] = super.w(value)
}
/** Specialized version of [[Param[Long]]] for Java. */
-class LongParam(parent: Params, name: String, doc: String, isValid: Long => Boolean)
+class LongParam(parent: String, name: String, doc: String, isValid: Long => Boolean)
extends Param[Long](parent, name, doc, isValid) {
- def this(parent: Params, name: String, doc: String) =
+ def this(parent: String, name: String, doc: String) =
this(parent, name, doc, ParamValidators.alwaysTrue)
+ def this(parent: Identifiable, name: String, doc: String, isValid: Long => Boolean) =
+ this(parent.uid, name, doc, isValid)
+
+ def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
+
override def w(value: Long): ParamPair[Long] = super.w(value)
}
/** Specialized version of [[Param[Boolean]]] for Java. */
-class BooleanParam(parent: Params, name: String, doc: String) // No need for isValid
+class BooleanParam(parent: String, name: String, doc: String) // No need for isValid
extends Param[Boolean](parent, name, doc) {
+ def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc)
+
override def w(value: Boolean): ParamPair[Boolean] = super.w(value)
}
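
The String-based primary constructors key each param by its owner's uid, while the new Identifiable overloads keep the familiar new IntParam(this, ...) call style working inside Params subclasses. A small sketch (the "demo" prefix is illustrative):

import org.apache.spark.ml.param.IntParam
import org.apache.spark.ml.util.Identifiable

val owner = new Identifiable { val uid = Identifiable.randomUID("demo") }
val byRef = new IntParam(owner, "k", "an illustrative count param")   // forwards to owner.uid
val byUid = new IntParam(owner.uid, "k", "an illustrative count param")
assert(byRef == byUid)
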
@@ -265,6 +287,9 @@ trait Params extends Identifiable with Serializable {
/**
* Returns all params sorted by their names. The default implementation uses Java reflection to
* list all public methods that have no arguments and return [[Param]].
+ *
+ * Note: Developers should not use this method in a constructor, because we cannot guarantee
+ * that this lazy val gets initialized before other params.
*/
lazy val params: Array[Param[_]] = {
val methods = this.getClass.getMethods
@@ -299,15 +324,36 @@ trait Params extends Identifiable with Serializable {
* those are checked during schema validation.
*/
def validateParams(): Unit = {
- params.filter(isDefined _).foreach { param =>
+ params.filter(isDefined).foreach { param =>
param.asInstanceOf[Param[Any]].validate($(param))
}
}
/**
- * Returns the documentation of all params.
+ * Explains a param.
+ * @param param input param, must belong to this instance.
+ * @return a string that contains the input param name, doc, and optionally its default value and
+ * the user-supplied value
+ */
+ def explainParam(param: Param[_]): String = {
+ shouldOwn(param)
+ val valueStr = if (isDefined(param)) {
+ val defaultValueStr = getDefault(param).map("default: " + _)
+ val currentValueStr = get(param).map("current: " + _)
+ (defaultValueStr ++ currentValueStr).mkString("(", ", ", ")")
+ } else {
+ "(undefined)"
+ }
+ s"${param.name}: ${param.doc} $valueStr"
+ }
+
+ /**
+ * Explains all params of this instance.
+ * @see [[explainParam()]]
*/
- def explainParams(): String = params.mkString("\n")
+ def explainParams(): String = {
+ params.map(explainParam).mkString("\n")
+ }
/** Checks whether a param is explicitly set. */
final def isSet(param: Param[_]): Boolean = {
@@ -392,7 +438,6 @@ trait Params extends Identifiable with Serializable {
* @param value the default value
*/
protected final def setDefault[T](param: Param[T], value: T): this.type = {
- shouldOwn(param)
defaultParamMap.put(param, value)
this
}
@@ -430,13 +475,13 @@ trait Params extends Identifiable with Serializable {
}
/**
- * Creates a copy of this instance with a randomly generated uid and some extra params.
- * The default implementation calls the default constructor to create a new instance, then
- * copies the embedded and extra parameters over and returns the new instance.
+ * Creates a copy of this instance with the same UID and some extra params.
+ * The default implementation creates a new instance of the same class through its
+ * (uid: String) constructor, then copies the embedded and extra parameters over and
+ * returns the new instance.
* Subclasses should override this method if the default approach is not sufficient.
*/
def copy(extra: ParamMap): Params = {
- val that = this.getClass.newInstance()
+ val that = this.getClass.getConstructor(classOf[String]).newInstance(uid)
copyValues(that, extra)
that
}
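
Because copy() now instantiates the same class reflectively through a single-String (uid) constructor, every concrete Params implementation needs such a constructor or must override copy itself. A sketch with a hypothetical class:

import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.util.Identifiable

class MyParams(override val uid: String) extends Params {
  def this() = this(Identifiable.randomUID("myParams"))
}

val original = new MyParams()
val copied = original.copy(ParamMap.empty)
assert(copied.uid == original.uid)   // the copy keeps the same uid
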
@@ -465,7 +510,7 @@ trait Params extends Identifiable with Serializable {
/** Validates that the input param belongs to this instance. */
private def shouldOwn(param: Param[_]): Unit = {
- require(param.parent.eq(this), s"Param $param does not belong to $this.")
+ require(param.parent == uid && hasParam(param.name), s"Param $param does not belong to $this.")
}
/**
@@ -581,7 +626,7 @@ final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any])
override def toString: String = {
map.toSeq.sortBy(_._1.name).map { case (param, value) =>
- s"\t${param.parent.uid}-${param.name}: $value"
+ s"\t${param.parent}-${param.name}: $value"
}.mkString("{\n", ",\n", "\n}")
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index d7cbffc3be26f..45c57b50da70f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -35,6 +35,7 @@ import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.optimization.NNLS
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
@@ -171,7 +172,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR
* Model fitted by ALS.
*/
class ALSModel private[ml] (
- override val parent: ALS,
+ override val uid: String,
k: Int,
userFactors: RDD[(Int, Array[Float])],
itemFactors: RDD[(Int, Array[Float])])
@@ -235,10 +236,12 @@ class ALSModel private[ml] (
* indicated user
* preferences rather than explicit ratings given to items.
*/
-class ALS extends Estimator[ALSModel] with ALSParams {
+class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams {
import org.apache.spark.ml.recommendation.ALS.Rating
+ def this() = this(Identifiable.randomUID("als"))
+
/** @group setParam */
def setRank(value: Int): this.type = set(rank, value)
@@ -303,7 +306,8 @@ class ALS extends Estimator[ALSModel] with ALSParams {
maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs),
alpha = $(alpha), nonnegative = $(nonnegative),
checkpointInterval = $(checkpointInterval), seed = $(seed))
- copyValues(new ALSModel(this, $(rank), userFactors, itemFactors))
+ val model = new ALSModel(uid, $(rank), userFactors, itemFactors).setParent(this)
+ copyValues(model)
}
override def transformSchema(schema: StructType): StructType = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index f8f0b161a4812..e67df21b2e4ae 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tree.{TreeRegressorParams, DecisionTreeParams, DecisionTreeModel, Node}
-import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree}
@@ -38,10 +38,12 @@ import org.apache.spark.sql.DataFrame
* It supports both continuous and categorical features.
*/
@AlphaComponent
-final class DecisionTreeRegressor
+final class DecisionTreeRegressor(override val uid: String)
extends Predictor[Vector, DecisionTreeRegressor, DecisionTreeRegressionModel]
with DecisionTreeParams with TreeRegressorParams {
+ def this() = this(Identifiable.randomUID("dtr"))
+
// Override parameter setters from parent trait for Java API compatibility.
override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value)
@@ -91,7 +93,7 @@ object DecisionTreeRegressor {
*/
@AlphaComponent
final class DecisionTreeRegressionModel private[ml] (
- override val parent: DecisionTreeRegressor,
+ override val uid: String,
override val rootNode: Node)
extends PredictionModel[Vector, DecisionTreeRegressionModel]
with DecisionTreeModel with Serializable {
@@ -104,7 +106,7 @@ final class DecisionTreeRegressionModel private[ml] (
}
override def copy(extra: ParamMap): DecisionTreeRegressionModel = {
- copyValues(new DecisionTreeRegressionModel(parent, rootNode), extra)
+ copyValues(new DecisionTreeRegressionModel(uid, rootNode), extra)
}
override def toString: String = {
@@ -128,6 +130,7 @@ private[ml] object DecisionTreeRegressionModel {
s"Cannot convert non-regression DecisionTreeModel (old API) to" +
s" DecisionTreeRegressionModel (new API). Algo is: ${oldModel.algo}")
val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures)
- new DecisionTreeRegressionModel(parent, rootNode)
+ val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtr")
+ new DecisionTreeRegressionModel(uid, rootNode)
}
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index 461905c12701a..4249ff5c1ebc7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -24,7 +24,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.tree.{GBTParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel}
-import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTrees => OldGBT}
@@ -42,10 +42,12 @@ import org.apache.spark.sql.DataFrame
* It supports both continuous and categorical features.
*/
@AlphaComponent
-final class GBTRegressor
+final class GBTRegressor(override val uid: String)
extends Predictor[Vector, GBTRegressor, GBTRegressionModel]
with GBTParams with TreeRegressorParams with Logging {
+ def this() = this(Identifiable.randomUID("gbtr"))
+
// Override parameter setters from parent trait for Java API compatibility.
// Parameters from TreeRegressorParams:
@@ -149,7 +151,7 @@ object GBTRegressor {
*/
@AlphaComponent
final class GBTRegressionModel(
- override val parent: GBTRegressor,
+ override val uid: String,
private val _trees: Array[DecisionTreeRegressionModel],
private val _treeWeights: Array[Double])
extends PredictionModel[Vector, GBTRegressionModel]
@@ -173,7 +175,7 @@ final class GBTRegressionModel(
}
override def copy(extra: ParamMap): GBTRegressionModel = {
- copyValues(new GBTRegressionModel(parent, _trees, _treeWeights), extra)
+ copyValues(new GBTRegressionModel(uid, _trees, _treeWeights), extra)
}
override def toString: String = {
@@ -199,6 +201,7 @@ private[ml] object GBTRegressionModel {
// parent, fittingParamMap for each tree is null since there are no good ways to set these.
DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures)
}
- new GBTRegressionModel(parent, newTrees, oldModel.treeWeights)
+ val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtr")
+ new GBTRegressionModel(uid, newTrees, oldModel.treeWeights)
}
}
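
The fromOld converters may be handed a null parent (the per-tree conversions above pass null explicitly), so they fall back to a freshly generated uid. A tiny standalone sketch of that guard, using a hypothetical helper name:

import org.apache.spark.ml.util.Identifiable

def uidOrRandom(parent: Identifiable, prefix: String): String =
  if (parent != null) parent.uid else Identifiable.randomUID(prefix)
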
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 36c242bb5f2a7..3ebb78f79201a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -20,14 +20,14 @@ package org.apache.spark.ml.regression
import scala.collection.mutable
import breeze.linalg.{DenseVector => BDV, norm => brzNorm}
-import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS,
- OWLQN => BreezeOWLQN}
+import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol}
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.BLAS._
import org.apache.spark.mllib.regression.LabeledPoint
@@ -59,9 +59,12 @@ private[regression] trait LinearRegressionParams extends PredictorParams
* - L2 + L1 (elastic net)
*/
@AlphaComponent
-class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel]
+class LinearRegression(override val uid: String)
+ extends Regressor[Vector, LinearRegression, LinearRegressionModel]
with LinearRegressionParams with Logging {
+ def this() = this(Identifiable.randomUID("linReg"))
+
/**
* Set the regularization parameter.
* Default is 0.0.
@@ -128,7 +131,7 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress
logWarning(s"The standard deviation of the label is zero, so the weights will be zeros " +
s"and the intercept will be the mean of the label; as a result, training is not needed.")
if (handlePersistence) instances.unpersist()
- return new LinearRegressionModel(this, Vectors.sparse(numFeatures, Seq()), yMean)
+ return new LinearRegressionModel(uid, Vectors.sparse(numFeatures, Seq()), yMean)
}
val featuresMean = summarizer.mean.toArray
@@ -182,7 +185,7 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress
if (handlePersistence) instances.unpersist()
// TODO: Converts to sparse format based on the storage, but may base on the scoring speed.
- new LinearRegressionModel(this, weights.compressed, intercept)
+ copyValues(new LinearRegressionModel(uid, weights.compressed, intercept))
}
}
@@ -193,7 +196,7 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress
*/
@AlphaComponent
class LinearRegressionModel private[ml] (
- override val parent: LinearRegression,
+ override val uid: String,
val weights: Vector,
val intercept: Double)
extends RegressionModel[Vector, LinearRegressionModel]
@@ -204,7 +207,7 @@ class LinearRegressionModel private[ml] (
}
override def copy(extra: ParamMap): LinearRegressionModel = {
- copyValues(new LinearRegressionModel(parent, weights, intercept), extra)
+ copyValues(new LinearRegressionModel(uid, weights, intercept), extra)
}
}
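
A usage sketch of the resulting wiring, assuming an existing DataFrame named dataset of labeled points (mirroring the regression suites below): train() builds the model with the estimator's uid, and the fitted model reports the estimator as its parent.

import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression().setMaxIter(10).setRegParam(1.0)
val model = lr.fit(dataset)
assert(model.uid == lr.uid)   // same uid as the estimator that produced it
assert(model.parent.asInstanceOf[LinearRegression].getMaxIter == 10)
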
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index dbc628927433d..82437aa8de294 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tree.{RandomForestParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel}
-import org.apache.spark.ml.util.MetadataUtils
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{RandomForest => OldRandomForest}
@@ -37,10 +37,12 @@ import org.apache.spark.sql.DataFrame
* It supports both continuous and categorical features.
*/
@AlphaComponent
-final class RandomForestRegressor
+final class RandomForestRegressor(override val uid: String)
extends Predictor[Vector, RandomForestRegressor, RandomForestRegressionModel]
with RandomForestParams with TreeRegressorParams {
+ def this() = this(Identifiable.randomUID("rfr"))
+
// Override parameter setters from parent trait for Java API compatibility.
// Parameters from TreeRegressorParams:
@@ -105,7 +107,7 @@ object RandomForestRegressor {
*/
@AlphaComponent
final class RandomForestRegressionModel private[ml] (
- override val parent: RandomForestRegressor,
+ override val uid: String,
private val _trees: Array[DecisionTreeRegressionModel])
extends PredictionModel[Vector, RandomForestRegressionModel]
with TreeEnsembleModel with Serializable {
@@ -128,7 +130,7 @@ final class RandomForestRegressionModel private[ml] (
}
override def copy(extra: ParamMap): RandomForestRegressionModel = {
- copyValues(new RandomForestRegressionModel(parent, _trees), extra)
+ copyValues(new RandomForestRegressionModel(uid, _trees), extra)
}
override def toString: String = {
@@ -154,6 +156,6 @@ private[ml] object RandomForestRegressionModel {
// parent, fittingParamMap for each tree is null since there are no good ways to set these.
DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures)
}
- new RandomForestRegressionModel(parent, newTrees)
+ new RandomForestRegressionModel(parent.uid, newTrees)
}
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index ac0d1fed84b2e..5c6ff2dda3604 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -23,6 +23,7 @@ import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
import org.apache.spark.ml.param._
+import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
@@ -81,7 +82,10 @@ private[ml] trait CrossValidatorParams extends Params {
* K-fold cross validation.
*/
@AlphaComponent
-class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorParams with Logging {
+class CrossValidator(override val uid: String) extends Estimator[CrossValidatorModel]
+ with CrossValidatorParams with Logging {
+
+ def this() = this(Identifiable.randomUID("cv"))
private val f2jBLAS = new F2jBLAS
@@ -136,7 +140,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
logInfo(s"Best cross-validation metric: $bestMetric.")
val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
- copyValues(new CrossValidatorModel(this, bestModel))
+ copyValues(new CrossValidatorModel(uid, bestModel).setParent(this))
}
override def transformSchema(schema: StructType): StructType = {
@@ -150,7 +154,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
*/
@AlphaComponent
class CrossValidatorModel private[ml] (
- override val parent: CrossValidator,
+ override val uid: String,
val bestModel: Model[_])
extends Model[CrossValidatorModel] with CrossValidatorParams {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
index 8a56748ab0a02..146697680092c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
@@ -19,15 +19,24 @@ package org.apache.spark.ml.util
import java.util.UUID
+
/**
- * Object with a unique id.
+ * Trait for an object with an immutable unique ID that identifies itself and its derivatives.
*/
-private[ml] trait Identifiable extends Serializable {
+trait Identifiable {
+
+ /**
+ * An immutable unique ID for the object and its derivatives.
+ */
+ val uid: String
+}
+
+object Identifiable {
/**
- * A unique id for the object. The default implementation concatenates the class name, "_", and 8
- * random hex chars.
+ * Returns a random UID that concatenates the given prefix, "_", and 12 random hex chars.
*/
- private[ml] val uid: String =
- this.getClass.getSimpleName + "_" + UUID.randomUUID().toString.take(8)
+ def randomUID(prefix: String): String = {
+ prefix + "_" + UUID.randomUUID().toString.takeRight(12)
+ }
}
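
A quick usage sketch of the now-public helper (the random suffix shown is illustrative):

import org.apache.spark.ml.util.Identifiable

val uid = Identifiable.randomUID("lr")              // e.g. "lr_94fe2dd47531"
assert(uid.startsWith("lr_") && uid.length == 15)   // prefix + "_" + 12 hex chars
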
diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java
index 7e7189a2b1d53..f75e024a713ee 100644
--- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java
@@ -84,7 +84,7 @@ public void logisticRegressionWithSetters() {
.setThreshold(0.6)
.setProbabilityCol("myProbability");
LogisticRegressionModel model = lr.fit(dataset);
- LogisticRegression parent = model.parent();
+ LogisticRegression parent = (LogisticRegression) model.parent();
assert(parent.getMaxIter() == 10);
assert(parent.getRegParam() == 1.0);
assert(parent.getThreshold() == 0.6);
@@ -110,7 +110,7 @@ public void logisticRegressionWithSetters() {
// Call fit() with new params, and check as many params as we can.
LogisticRegressionModel model2 = lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1),
lr.threshold().w(0.4), lr.probabilityCol().w("theProb"));
- LogisticRegression parent2 = model2.parent();
+ LogisticRegression parent2 = (LogisticRegression) model2.parent();
assert(parent2.getMaxIter() == 5);
assert(parent2.getRegParam() == 0.1);
assert(parent2.getThreshold() == 0.4);
diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
index 8abe575610d19..3a41890b92d63 100644
--- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
+++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
@@ -21,43 +21,65 @@
import com.google.common.collect.Lists;
+import org.apache.spark.ml.util.Identifiable$;
+
/**
* A subclass of Params for testing.
*/
public class JavaTestParams extends JavaParams {
- public IntParam myIntParam;
+ public JavaTestParams() {
+ this.uid_ = Identifiable$.MODULE$.randomUID("javaTestParams");
+ init();
+ }
+
+ public JavaTestParams(String uid) {
+ this.uid_ = uid;
+ init();
+ }
+
+ private String uid_;
+
+ @Override
+ public String uid() {
+ return uid_;
+ }
- public int getMyIntParam() { return (Integer)getOrDefault(myIntParam); }
+ private IntParam myIntParam_;
+ public IntParam myIntParam() { return myIntParam_; }
+
+ public int getMyIntParam() { return (Integer)getOrDefault(myIntParam_); }
public JavaTestParams setMyIntParam(int value) {
- set(myIntParam, value); return this;
+ set(myIntParam_, value); return this;
}
- public DoubleParam myDoubleParam;
+ private DoubleParam myDoubleParam_;
+ public DoubleParam myDoubleParam() { return myDoubleParam_; }
- public double getMyDoubleParam() { return (Double)getOrDefault(myDoubleParam); }
+ public double getMyDoubleParam() { return (Double)getOrDefault(myDoubleParam_); }
public JavaTestParams setMyDoubleParam(double value) {
- set(myDoubleParam, value); return this;
+ set(myDoubleParam_, value); return this;
}
- public Param<String> myStringParam;
+ private Param<String> myStringParam_;
+ public Param<String> myStringParam() { return myStringParam_; }
- public String getMyStringParam() { return (String)getOrDefault(myStringParam); }
+ public String getMyStringParam() { return getOrDefault(myStringParam_); }
public JavaTestParams setMyStringParam(String value) {
- set(myStringParam, value); return this;
+ set(myStringParam_, value); return this;
}
- public JavaTestParams() {
- myIntParam = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0));
- myDoubleParam = new DoubleParam(this, "myDoubleParam", "this is a double param",
+ private void init() {
+ myIntParam_ = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0));
+ myDoubleParam_ = new DoubleParam(this, "myDoubleParam", "this is a double param",
ParamValidators.inRange(0.0, 1.0));
List<String> validStrings = Lists.newArrayList("a", "b");
- myStringParam = new Param<String>(this, "myStringParam", "this is a string param",
+ myStringParam_ = new Param<String>(this, "myStringParam", "this is a string param",
ParamValidators.inArray(validStrings));
- setDefault(myIntParam, 1);
- setDefault(myDoubleParam, 0.5);
+ setDefault(myIntParam_, 1);
+ setDefault(myDoubleParam_, 0.5);
}
}
diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java
index a82b86d560b6e..d591a456864e4 100644
--- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java
@@ -77,14 +77,14 @@ public void linearRegressionWithSetters() {
.setMaxIter(10)
.setRegParam(1.0);
LinearRegressionModel model = lr.fit(dataset);
- LinearRegression parent = model.parent();
+ LinearRegression parent = (LinearRegression) model.parent();
assertEquals(10, parent.getMaxIter());
assertEquals(1.0, parent.getRegParam(), 0.0);
// Call fit() with new params, and check as many params as we can.
LinearRegressionModel model2 =
lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1), lr.predictionCol().w("thePred"));
- LinearRegression parent2 = model2.parent();
+ LinearRegression parent2 = (LinearRegression) model2.parent();
assertEquals(5, parent2.getMaxIter());
assertEquals(0.1, parent2.getRegParam(), 0.0);
assertEquals("thePred", model2.getPredictionCol());
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 03af4ecd7a7e0..3fdc66be8a314 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
@@ -268,7 +268,7 @@ private[ml] object DecisionTreeClassifierSuite extends FunSuite {
val newTree = dt.fit(newData)
// Use parent, fittingParamMap from newTree since these are not checked anyways.
val oldTreeAsNew = DecisionTreeClassificationModel.fromOld(
- oldTree, newTree.parent, categoricalFeatures)
+ oldTree, newTree.parent.asInstanceOf[DecisionTreeClassifier], categoricalFeatures)
TreeTests.checkEqual(oldTreeAsNew, newTree)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index 16c758b82c7cd..ea86867f1161a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -130,7 +130,7 @@ private object GBTClassifierSuite {
val newModel = gbt.fit(newData)
// Use parent, fittingParamMap from newTree since these are not checked anyways.
val oldModelAsNew = GBTClassificationModel.fromOld(
- oldModel, newModel.parent, categoricalFeatures)
+ oldModel, newModel.parent.asInstanceOf[GBTClassifier], categoricalFeatures)
TreeTests.checkEqual(oldModelAsNew, newModel)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 4df8016009171..43765241a20b6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -19,13 +19,12 @@ package org.apache.spark.ml.classification
import org.scalatest.FunSuite
-import org.apache.spark.mllib.classification.LogisticRegressionSuite
+import org.apache.spark.mllib.classification.LogisticRegressionSuite._
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-
class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
@transient var sqlContext: SQLContext = _
@@ -37,8 +36,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
super.beforeAll()
sqlContext = new SQLContext(sc)
- dataset = sqlContext.createDataFrame(sc.parallelize(LogisticRegressionSuite
- .generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42), 4))
+ dataset = sqlContext.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42))
/**
* Here is the instruction describing how to export the test data into CSV format
@@ -60,31 +58,30 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
val xMean = Array(5.843, 3.057, 3.758, 1.199)
val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
- val testData = LogisticRegressionSuite.generateMultinomialLogisticInput(
- weights, xMean, xVariance, true, nPoints, 42)
+ val testData = generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42)
- sqlContext.createDataFrame(sc.parallelize(LogisticRegressionSuite
- .generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42), 4))
+ sqlContext.createDataFrame(
+ generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42))
}
}
test("logistic regression: default params") {
val lr = new LogisticRegression
- assert(lr.getLabelCol == "label")
- assert(lr.getFeaturesCol == "features")
- assert(lr.getPredictionCol == "prediction")
- assert(lr.getRawPredictionCol == "rawPrediction")
- assert(lr.getProbabilityCol == "probability")
- assert(lr.getFitIntercept == true)
+ assert(lr.getLabelCol === "label")
+ assert(lr.getFeaturesCol === "features")
+ assert(lr.getPredictionCol === "prediction")
+ assert(lr.getRawPredictionCol === "rawPrediction")
+ assert(lr.getProbabilityCol === "probability")
+ assert(lr.getFitIntercept)
val model = lr.fit(dataset)
model.transform(dataset)
.select("label", "probability", "prediction", "rawPrediction")
.collect()
assert(model.getThreshold === 0.5)
- assert(model.getFeaturesCol == "features")
- assert(model.getPredictionCol == "prediction")
- assert(model.getRawPredictionCol == "rawPrediction")
- assert(model.getProbabilityCol == "probability")
+ assert(model.getFeaturesCol === "features")
+ assert(model.getPredictionCol === "prediction")
+ assert(model.getRawPredictionCol === "rawPrediction")
+ assert(model.getProbabilityCol === "probability")
assert(model.intercept !== 0.0)
}
@@ -103,7 +100,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
.setThreshold(0.6)
.setProbabilityCol("myProbability")
val model = lr.fit(dataset)
- val parent = model.parent
+ val parent = model.parent.asInstanceOf[LogisticRegression]
assert(parent.getMaxIter === 10)
assert(parent.getRegParam === 1.0)
assert(parent.getThreshold === 0.6)
@@ -129,12 +126,12 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext {
// Call fit() with new params, and check as many params as we can.
val model2 = lr.fit(dataset, lr.maxIter -> 5, lr.regParam -> 0.1, lr.threshold -> 0.4,
lr.probabilityCol -> "theProb")
- val parent2 = model2.parent
+ val parent2 = model2.parent.asInstanceOf[LogisticRegression]
assert(parent2.getMaxIter === 5)
assert(parent2.getRegParam === 0.1)
assert(parent2.getThreshold === 0.4)
assert(model2.getThreshold === 0.4)
- assert(model2.getProbabilityCol == "theProb")
+ assert(model2.getProbabilityCol === "theProb")
}
test("logistic regression: Predictor, Classifier methods") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index e65ffae918ca9..990cfb08af83b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -57,7 +57,7 @@ class OneVsRestSuite extends FunSuite with MLlibTestSparkContext {
test("one-vs-rest: default params") {
val numClasses = 3
val ova = new OneVsRest()
- ova.setClassifier(new LogisticRegression)
+ .setClassifier(new LogisticRegression)
assert(ova.getLabelCol === "label")
assert(ova.getPredictionCol === "prediction")
val ovaModel = ova.fit(dataset)
@@ -97,7 +97,9 @@ class OneVsRestSuite extends FunSuite with MLlibTestSparkContext {
}
}
-private class MockLogisticRegression extends LogisticRegression {
+private class MockLogisticRegression(uid: String) extends LogisticRegression(uid) {
+
+ def this() = this("mockLogReg")
setMaxIter(1)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index c41def9330504..08f86fa45bc1d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -160,7 +160,7 @@ private object RandomForestClassifierSuite {
val newModel = rf.fit(newData)
// Use parent, fittingParamMap from newTree since these are not checked anyways.
val oldModelAsNew = RandomForestClassificationModel.fromOld(
- oldModel, newModel.parent, categoricalFeatures)
+ oldModel, newModel.parent.asInstanceOf[RandomForestClassifier], categoricalFeatures)
TreeTests.checkEqual(oldModelAsNew, newModel)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index 6056e7d3f6ff8..b96874f3a8821 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -23,21 +23,22 @@ class ParamsSuite extends FunSuite {
test("param") {
val solver = new TestParams()
+ val uid = solver.uid
import solver.{maxIter, inputCol}
assert(maxIter.name === "maxIter")
assert(maxIter.doc === "max number of iterations (>= 0)")
- assert(maxIter.parent.eq(solver))
- assert(maxIter.toString === "maxIter: max number of iterations (>= 0) (default: 10)")
+ assert(maxIter.parent === uid)
+ assert(maxIter.toString === s"${uid}__maxIter")
assert(!maxIter.isValid(-1))
assert(maxIter.isValid(0))
assert(maxIter.isValid(1))
solver.setMaxIter(5)
- assert(maxIter.toString ===
+ assert(solver.explainParam(maxIter) ===
"maxIter: max number of iterations (>= 0) (default: 10, current: 5)")
- assert(inputCol.toString === "inputCol: input column name (undefined)")
+ assert(inputCol.toString === s"${uid}__inputCol")
intercept[IllegalArgumentException] {
solver.setMaxIter(-1)
@@ -118,7 +119,10 @@ class ParamsSuite extends FunSuite {
assert(!solver.isDefined(inputCol))
intercept[NoSuchElementException](solver.getInputCol)
- assert(solver.explainParams() === Seq(inputCol, maxIter).mkString("\n"))
+ assert(solver.explainParam(maxIter) ===
+ "maxIter: max number of iterations (>= 0) (default: 10, current: 100)")
+ assert(solver.explainParams() ===
+ Seq(inputCol, maxIter).map(solver.explainParam).mkString("\n"))
assert(solver.getParam("inputCol").eq(inputCol))
assert(solver.getParam("maxIter").eq(maxIter))
@@ -148,7 +152,7 @@ class ParamsSuite extends FunSuite {
assert(!solver.isSet(maxIter))
val copied = solver.copy(ParamMap(solver.maxIter -> 50))
- assert(copied.uid !== solver.uid)
+ assert(copied.uid === solver.uid)
assert(copied.getInputCol === solver.getInputCol)
assert(copied.getMaxIter === 50)
}
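
The behaviour the updated suite pins down, as a short usage sketch with the TestParams helper defined below:

import org.apache.spark.ml.param.TestParams

val solver = new TestParams().setMaxIter(5)
// The human-readable description moved from Param.toString to explainParam:
assert(solver.explainParam(solver.maxIter) ==
  "maxIter: max number of iterations (>= 0) (default: 10, current: 5)")
// Param.toString is now the stable "<uid>__<name>" key:
assert(solver.maxIter.toString == s"${solver.uid}__maxIter")
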
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala b/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala
index dc16073640407..a9e78366ad98f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala
@@ -18,9 +18,12 @@
package org.apache.spark.ml.param
import org.apache.spark.ml.param.shared.{HasInputCol, HasMaxIter}
+import org.apache.spark.ml.util.Identifiable
/** A subclass of Params for testing. */
-class TestParams extends Params with HasMaxIter with HasInputCol {
+class TestParams(override val uid: String) extends Params with HasMaxIter with HasInputCol {
+
+ def this() = this(Identifiable.randomUID("testParams"))
def setMaxIter(value: Int): this.type = { set(maxIter, value); this }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
index 5aa81b44ddaf9..1196a772dfdd4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
@@ -85,7 +85,7 @@ private[ml] object DecisionTreeRegressorSuite extends FunSuite {
val newTree = dt.fit(newData)
// Use parent, fittingParamMap from newTree since these are not checked anyways.
val oldTreeAsNew = DecisionTreeRegressionModel.fromOld(
- oldTree, newTree.parent, categoricalFeatures)
+ oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures)
TreeTests.checkEqual(oldTreeAsNew, newTree)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index 25b36ab08b67c..40e7e3273e965 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -130,7 +130,8 @@ private object GBTRegressorSuite {
val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
val newModel = gbt.fit(newData)
// Use parent, fittingParamMap from newTree since these are not checked anyways.
- val oldModelAsNew = GBTRegressionModel.fromOld(oldModel, newModel.parent, categoricalFeatures)
+ val oldModelAsNew = GBTRegressionModel.fromOld(
+ oldModel, newModel.parent.asInstanceOf[GBTRegressor], categoricalFeatures)
TreeTests.checkEqual(oldModelAsNew, newModel)
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
index 45f09f4fdab81..3efffbb763b78 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
@@ -116,7 +116,7 @@ private object RandomForestRegressorSuite extends FunSuite {
val newModel = rf.fit(newData)
// Use parent, fittingParamMap from newTree since these are not checked anyways.
val oldModelAsNew = RandomForestRegressionModel.fromOld(
- oldModel, newModel.parent, categoricalFeatures)
+ oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures)
TreeTests.checkEqual(oldModelAsNew, newModel)
}
}
diff --git a/pom.xml b/pom.xml
index 564a443466e5a..91d1d843c762a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -122,9 +122,9 @@
1.7.10
1.2.17
2.2.0
- 2.4.1
+ 2.5.0
${hadoop.version}
- 0.98.7-hadoop1
+ 0.98.7-hadoop2
hbase
1.4.0
3.4.5
@@ -143,7 +143,7 @@
2.0.8
3.1.0
1.7.7
-
+ hadoop2
0.7.1
1.8.3
1.1.0
@@ -155,7 +155,7 @@
${scala.version}
org.scala-lang
3.6.3
- 1.8.8
+ 1.9.13
2.4.4
1.1.1.7
1.1.2
@@ -1644,26 +1644,27 @@
-->
- hadoop-2.2
+ hadoop-1
- 2.2.0
- 2.5.0
- 0.98.7-hadoop2
- hadoop2
- 1.9.13
+ 1.0.4
+ 2.4.1
+ 0.98.7-hadoop1
+ hadoop1
+ 1.8.8
+
+ hadoop-2.2
+
+
+
hadoop-2.3
2.3.0
- 2.5.0
0.9.3
- 0.98.7-hadoop2
3.1.1
- hadoop2
- 1.9.13
@@ -1671,12 +1672,8 @@
hadoop-2.4
2.4.0
- 2.5.0
0.9.3
- 0.98.7-hadoop2
3.1.1
- hadoop2
- 1.9.13
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index a4c61149dd975..4baeeb5b58c2d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -322,6 +322,11 @@ class Analyzer(
case oldVersion @ Aggregate(_, aggregateExpressions, _)
if findAliases(aggregateExpressions).intersect(conflictingAttributes).nonEmpty =>
(oldVersion, oldVersion.copy(aggregateExpressions = newAliases(aggregateExpressions)))
+
+ case oldVersion @ Window(_, windowExpressions, _, child)
+ if AttributeSet(windowExpressions.map(_.toAttribute)).intersect(conflictingAttributes)
+ .nonEmpty =>
+ (oldVersion, oldVersion.copy(windowExpressions = newAliases(windowExpressions)))
}.headOption.getOrElse { // Only handle first case, others will be fixed on the next pass.
sys.error(
s"""
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index eaa9d6aad1f31..5c7152e2140db 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -763,4 +763,14 @@ class SQLQuerySuite extends QueryTest {
sql("SELECT CASE k WHEN 2 THEN 22 WHEN 4 THEN 44 ELSE 0 END, v FROM t"),
Row(0, "1") :: Row(22, "2") :: Row(0, "3") :: Row(44, "4") :: Row(0, "5") :: Nil)
}
+
+ test("SPARK-7595: Window will cause resolve failed with self join") {
+ checkAnswer(sql(
+ """
+ |with
+ | v1 as (select key, count(value) over (partition by key) cnt_val from src),
+ | v2 as (select v1.key, v1_lag.cnt_val from v1, v1 v1_lag where v1.key = v1_lag.key)
+ | select * from v2 order by key limit 1
+ """.stripMargin), Row(0, 3))
+ }
}
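
The Analyzer case added above mirrors the existing Project and Aggregate handling for self joins: when the same windowed plan appears on both sides of a join, the attributes produced by the window expressions carry identical expression IDs, so one side is re-aliased before resolution continues. The CTE in the test expands to roughly the shape below; a sketch, assuming a HiveContext named hiveCtx and the standard Hive test table src:

val v1 = hiveCtx.sql(
  "select key, count(value) over (partition by key) cnt_val from src")
v1.registerTempTable("v1")
// Without the new Window case, this self join fails to resolve the duplicated cnt_val.
hiveCtx.sql("select a.key, b.cnt_val from v1 a, v1 b where a.key = b.key").show()
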
diff --git a/yarn/pom.xml b/yarn/pom.xml
index 7c8c3613e7a05..00d219f836708 100644
--- a/yarn/pom.xml
+++ b/yarn/pom.xml
@@ -30,6 +30,7 @@
Spark Project YARN
yarn
+ 1.9
@@ -85,7 +86,12 @@
jetty-servlet
-
+
+
+
org.apache.hadoop
hadoop-yarn-server-tests
@@ -97,59 +103,44 @@
mockito-all
test
+
+ org.mortbay.jetty
+ jetty
+ 6.1.26
+
+
+ org.mortbay.jetty
+ servlet-api
+
+
+ test
+
+
+ com.sun.jersey
+ jersey-core
+ ${jersey.version}
+ test
+
+
+ com.sun.jersey
+ jersey-json
+ ${jersey.version}
+ test
+
+
+ stax
+ stax-api
+
+
+
+
+ com.sun.jersey
+ jersey-server
+ ${jersey.version}
+ test
+
-
-
-
-
- hadoop-2.2
-
- 1.9
-
-
-
- org.mortbay.jetty
- jetty
- 6.1.26
-
-
- org.mortbay.jetty
- servlet-api
-
-
- test
-
-
- com.sun.jersey
- jersey-core
- ${jersey.version}
- test
-
-
- com.sun.jersey
- jersey-json
- ${jersey.version}
- test
-
-
- stax
- stax-api
-
-
-
-
- com.sun.jersey
- jersey-server
- ${jersey.version}
- test
-
-
-
-
-
+
target/scala-${scala.binary.version}/classes
target/scala-${scala.binary.version}/test-classes