
Commit

Merge branch 'master' of git://git.apache.org/spark into SPARK-2583
sarutak committed Jul 22, 2014
2 parents 12d3de8 + 75db174 commit ffaa83d
Showing 197 changed files with 1,204 additions and 346 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ conf/spark-env.sh
conf/streaming-env.sh
conf/log4j.properties
conf/spark-defaults.conf
conf/hive-site.xml
docs/_site
docs/api
target/
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -134,8 +134,8 @@ class RangePartitioner[K : Ordering : ClassTag, V](
def getPartition(key: Any): Int = {
val k = key.asInstanceOf[K]
var partition = 0
if (rangeBounds.length < 1000) {
// If we have less than 100 partitions naive search
if (rangeBounds.length <= 128) {
// If we have less than 128 partitions naive search
while (partition < rangeBounds.length && ordering.gt(k, rangeBounds(partition))) {
partition += 1
}
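The hunk above lowers the cutoff below which `getPartition` scans the range bounds linearly; the `CollectionsUtils` change later in this commit suggests that larger bound arrays fall back to a binary search. A minimal, self-contained sketch of that two-path lookup, using illustrative names and a hand-rolled binary search rather than Spark's helper:

{% highlight scala %}
// Sketch only - illustrative names, not the Spark source. `bounds` holds the
// sorted upper bound of each partition; the returned index is the partition.
object PartitionLookupSketch {
  def getPartition[K](key: K, bounds: Array[K])(implicit ord: Ordering[K]): Int = {
    if (bounds.length <= 128) {
      // Few bounds: a simple linear scan is cheap.
      var p = 0
      while (p < bounds.length && ord.gt(key, bounds(p))) p += 1
      p
    } else {
      // Many bounds: binary search for the first bound that is >= key.
      var lo = 0
      var hi = bounds.length
      while (lo < hi) {
        val mid = (lo + hi) >>> 1
        if (ord.gt(key, bounds(mid))) lo = mid + 1 else hi = mid
      }
      lo
    }
  }
}
{% endhighlight %}

Both branches return the same index, and a key larger than every bound maps to the last partition (index `bounds.length`).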
@@ -99,7 +99,6 @@ class TaskMetrics extends Serializable {
existingMetrics.fetchWaitTime += newMetrics.fetchWaitTime
existingMetrics.localBlocksFetched += newMetrics.localBlocksFetched
existingMetrics.remoteBlocksFetched += newMetrics.remoteBlocksFetched
existingMetrics.totalBlocksFetched += newMetrics.totalBlocksFetched
existingMetrics.remoteBytesRead += newMetrics.remoteBytesRead
case None =>
_shuffleReadMetrics = Some(newMetrics)
@@ -149,7 +148,7 @@ class ShuffleReadMetrics extends Serializable {
/**
* Number of blocks fetched in this shuffle by this task (remote or local)
*/
var totalBlocksFetched: Int = _
def totalBlocksFetched: Int = remoteBlocksFetched + localBlocksFetched

/**
* Number of remote blocks fetched in this shuffle by this task
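A hedged sketch of the pattern these two hunks adopt (illustrative class, not the Spark source): the aggregate count is derived from its parts instead of being a separate mutable field, so it cannot drift out of sync when metrics from several fetches are merged.

{% highlight scala %}
// Illustrative sketch only.
class FetchMetricsSketch extends Serializable {
  var remoteBlocksFetched: Int = 0
  var localBlocksFetched: Int = 0

  // Derived on demand, so it is always consistent with the two counters.
  def totalBlocksFetched: Int = remoteBlocksFetched + localBlocksFetched

  def merge(other: FetchMetricsSketch): Unit = {
    remoteBlocksFetched += other.remoteBlocksFetched
    localBlocksFetched += other.localBlocksFetched
    // No separate total to update - the def above stays correct automatically.
  }
}
{% endhighlight %}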
52 changes: 48 additions & 4 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1269,11 +1269,55 @@ abstract class RDD[T: ClassTag](

/** A description of this RDD and its recursive dependencies for debugging. */
def toDebugString: String = {
def debugString(rdd: RDD[_], prefix: String = ""): Seq[String] = {
Seq(prefix + rdd + " (" + rdd.partitions.size + " partitions)") ++
rdd.dependencies.flatMap(d => debugString(d.rdd, prefix + " "))
// Apply a different rule to the last child
def debugChildren(rdd: RDD[_], prefix: String): Seq[String] = {
val len = rdd.dependencies.length
len match {
case 0 => Seq.empty
case 1 =>
val d = rdd.dependencies.head
debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_,_,_]], true)
case _ =>
val frontDeps = rdd.dependencies.take(len - 1)
val frontDepStrings = frontDeps.flatMap(
d => debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_,_,_]]))

val lastDep = rdd.dependencies.last
val lastDepStrings =
debugString(lastDep.rdd, prefix, lastDep.isInstanceOf[ShuffleDependency[_,_,_]], true)

(frontDepStrings ++ lastDepStrings)
}
}
// The first RDD in the dependency stack has no parents, so no need for a +-
def firstDebugString(rdd: RDD[_]): Seq[String] = {
val partitionStr = "(" + rdd.partitions.size + ")"
val leftOffset = (partitionStr.length - 1) / 2
val nextPrefix = (" " * leftOffset) + "|" + (" " * (partitionStr.length - leftOffset))
Seq(partitionStr + " " + rdd) ++ debugChildren(rdd, nextPrefix)
}
def shuffleDebugString(rdd: RDD[_], prefix: String = "", isLastChild: Boolean): Seq[String] = {
val partitionStr = "(" + rdd.partitions.size + ")"
val leftOffset = (partitionStr.length - 1) / 2
val thisPrefix = prefix.replaceAll("\\|\\s+$", "")
val nextPrefix = (
thisPrefix
+ (if (isLastChild) " " else "| ")
+ (" " * leftOffset) + "|" + (" " * (partitionStr.length - leftOffset)))
Seq(thisPrefix + "+-" + partitionStr + " " + rdd) ++ debugChildren(rdd, nextPrefix)
}
def debugString(rdd: RDD[_],
prefix: String = "",
isShuffle: Boolean = true,
isLastChild: Boolean = false): Seq[String] = {
if (isShuffle) {
shuffleDebugString(rdd, prefix, isLastChild)
}
else {
Seq(prefix + rdd) ++ debugChildren(rdd, prefix)
}
}
debugString(this).mkString("\n")
firstDebugString(this).mkString("\n")
}

override def toString: String = "%s%s[%d] at %s".format(
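For reference, a hedged illustration of the tree-shaped output the reworked `toDebugString` produces. It assumes an existing `SparkContext` named `sc`; the RDD names, IDs, and exact spacing in the commented output are approximate, not taken from the commit.

{% highlight scala %}
val counts = sc.textFile("README.md")
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)

println(counts.toDebugString)
// Approximate output: "(n)" is a partition count and "+-" marks a shuffle
// boundary in the lineage.
// (2) ShuffledRDD[4] at reduceByKey at <console>:14
//  +-(2) MappedRDD[3] at map at <console>:14
//     |  FlatMappedRDD[2] at flatMap at <console>:14
//     |  MappedRDD[1] at textFile at <console>:12
//     |  HadoopRDD[0] at textFile at <console>:12
{% endhighlight %}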
@@ -81,7 +81,6 @@ private[hash] object BlockStoreShuffleFetcher extends Logging {
shuffleMetrics.shuffleFinishTime = System.currentTimeMillis
shuffleMetrics.fetchWaitTime = blockFetcherItr.fetchWaitTime
shuffleMetrics.remoteBytesRead = blockFetcherItr.remoteBytesRead
shuffleMetrics.totalBlocksFetched = blockFetcherItr.totalBlocks
shuffleMetrics.localBlocksFetched = blockFetcherItr.numLocalBlocks
shuffleMetrics.remoteBlocksFetched = blockFetcherItr.numRemoteBlocks
context.taskMetrics.updateShuffleReadMetrics(shuffleMetrics)
@@ -46,7 +46,6 @@ import org.apache.spark.util.Utils
private[storage]
trait BlockFetcherIterator extends Iterator[(BlockId, Option[Iterator[Any]])] with Logging {
def initialize()
def totalBlocks: Int
def numLocalBlocks: Int
def numRemoteBlocks: Int
def fetchWaitTime: Long
@@ -199,7 +198,7 @@ object BlockFetcherIterator {
}
}
logInfo("Getting " + _numBlocksToFetch + " non-empty blocks out of " +
totalBlocks + " blocks")
(numLocal + numRemote) + " blocks")
remoteRequests
}

@@ -242,7 +241,6 @@ object BlockFetcherIterator {
logDebug("Got local blocks in " + Utils.getUsedTimeMs(startTime) + " ms")
}

override def totalBlocks: Int = numLocal + numRemote
override def numLocalBlocks: Int = numLocal
override def numRemoteBlocks: Int = numRemote
override def fetchWaitTime: Long = _fetchWaitTime
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/ui/ToolTips.scala
@@ -20,9 +20,9 @@ package org.apache.spark.ui
private[spark] object ToolTips {
val SCHEDULER_DELAY =
"""Scheduler delay includes time to ship the task from the scheduler to
the executor, and time the time to send a message from the executor to the scheduler stating
that the task has completed. When the scheduler becomes overloaded, task completion messages
become queued up, and scheduler delay increases."""
the executor, and time to send the task result from the executor to the scheduler. If
scheduler delay is large, consider decreasing the size of tasks or decreasing the size
of task results."""

val INPUT = "Bytes read from Hadoop or from Spark storage."

@@ -19,11 +19,11 @@ package org.apache.spark.util

import java.util

import scala.Array
import scala.reflect._
import scala.reflect.{classTag, ClassTag}

private[spark] object CollectionsUtils {
def makeBinarySearch[K : Ordering : ClassTag] : (Array[K], K) => Int = {
// For primitive keys, we can use the natural ordering. Otherwise, use the Ordering comparator.
classTag[K] match {
case ClassTag.Float =>
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Float]], x.asInstanceOf[Float])
@@ -40,7 +40,8 @@ private[spark] object CollectionsUtils {
case ClassTag.Long =>
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Long]], x.asInstanceOf[Long])
case _ =>
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[AnyRef]], x)
val comparator = implicitly[Ordering[K]].asInstanceOf[java.util.Comparator[Any]]
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[AnyRef]], x, comparator)
}
}
}
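The fix above matters for key types that have an `Ordering` but do not implement `Comparable`: without the comparator, `Arrays.binarySearch` on an `Array[AnyRef]` relies on the elements implementing `Comparable` and throws a `ClassCastException`. A hedged sketch using a hypothetical key type, not anything from the commit:

{% highlight scala %}
import java.util.Arrays

// Hypothetical key type: no Comparable, but an implicit Ordering.
case class Temperature(celsius: Int)

object BinarySearchSketch {
  implicit val tempOrdering: Ordering[Temperature] =
    Ordering.by((t: Temperature) => t.celsius)

  def main(args: Array[String]): Unit = {
    val bounds = Array(Temperature(0), Temperature(10), Temperature(20))
    // Same call shape as the fixed fallback branch above: pass the Ordering
    // as a java.util.Comparator instead of relying on Comparable elements.
    val comparator = implicitly[Ordering[Temperature]].asInstanceOf[java.util.Comparator[Any]]
    val idx = Arrays.binarySearch(bounds.asInstanceOf[Array[AnyRef]], Temperature(10), comparator)
    println(idx)  // 1
  }
}
{% endhighlight %}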
2 changes: 0 additions & 2 deletions core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -237,7 +237,6 @@ private[spark] object JsonProtocol {

def shuffleReadMetricsToJson(shuffleReadMetrics: ShuffleReadMetrics): JValue = {
("Shuffle Finish Time" -> shuffleReadMetrics.shuffleFinishTime) ~
("Total Blocks Fetched" -> shuffleReadMetrics.totalBlocksFetched) ~
("Remote Blocks Fetched" -> shuffleReadMetrics.remoteBlocksFetched) ~
("Local Blocks Fetched" -> shuffleReadMetrics.localBlocksFetched) ~
("Fetch Wait Time" -> shuffleReadMetrics.fetchWaitTime) ~
@@ -548,7 +547,6 @@ private[spark] object JsonProtocol {
def shuffleReadMetricsFromJson(json: JValue): ShuffleReadMetrics = {
val metrics = new ShuffleReadMetrics
metrics.shuffleFinishTime = (json \ "Shuffle Finish Time").extract[Long]
metrics.totalBlocksFetched = (json \ "Total Blocks Fetched").extract[Int]
metrics.remoteBlocksFetched = (json \ "Remote Blocks Fetched").extract[Int]
metrics.localBlocksFetched = (json \ "Local Blocks Fetched").extract[Int]
metrics.fetchWaitTime = (json \ "Fetch Wait Time").extract[Long]
14 changes: 14 additions & 0 deletions core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -91,6 +91,17 @@ class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMet
}
}

test("RangePartitioner for keys that are not Comparable (but with Ordering)") {
// Row does not extend Comparable, but has an implicit Ordering defined.
implicit object RowOrdering extends Ordering[Row] {
override def compare(x: Row, y: Row) = x.value - y.value
}

val rdd = sc.parallelize(1 to 4500).map(x => (Row(x), Row(x)))
val partitioner = new RangePartitioner(1500, rdd)
partitioner.getPartition(Row(100))
}

test("HashPartitioner not equal to RangePartitioner") {
val rdd = sc.parallelize(1 to 10).map(x => (x, x))
val rangeP2 = new RangePartitioner(2, rdd)
@@ -177,3 +188,6 @@ class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMet
// Add other tests here for classes that should be able to handle empty partitions correctly
}
}


private sealed case class Row(value: Int)
@@ -314,7 +314,6 @@ class JsonProtocolSuite extends FunSuite {

private def assertEquals(metrics1: ShuffleReadMetrics, metrics2: ShuffleReadMetrics) {
assert(metrics1.shuffleFinishTime === metrics2.shuffleFinishTime)
assert(metrics1.totalBlocksFetched === metrics2.totalBlocksFetched)
assert(metrics1.remoteBlocksFetched === metrics2.remoteBlocksFetched)
assert(metrics1.localBlocksFetched === metrics2.localBlocksFetched)
assert(metrics1.fetchWaitTime === metrics2.fetchWaitTime)
@@ -513,7 +512,6 @@ class JsonProtocolSuite extends FunSuite {
} else {
val sr = new ShuffleReadMetrics
sr.shuffleFinishTime = b + c
sr.totalBlocksFetched = e + f
sr.remoteBytesRead = b + d
sr.localBlocksFetched = e
sr.fetchWaitTime = a + d
@@ -584,7 +582,6 @@ class JsonProtocolSuite extends FunSuite {
| "Memory Bytes Spilled":800,"Disk Bytes Spilled":0,
| "Shuffle Read Metrics":{
| "Shuffle Finish Time":900,
| "Total Blocks Fetched":1500,
| "Remote Blocks Fetched":800,
| "Local Blocks Fetched":700,
| "Fetch Wait Time":900,
49 changes: 48 additions & 1 deletion docs/mllib-clustering.md
@@ -69,7 +69,54 @@ println("Within Set Sum of Squared Errors = " + WSSSE)
All of MLlib's methods use Java-friendly types, so you can import and call them there the same
way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
calling `.rdd()` on your `JavaRDD` object.
calling `.rdd()` on your `JavaRDD` object. A standalone application example,
equivalent to the provided Scala example, is given below:

{% highlight java %}
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.SparkConf;

public class KMeansExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("K-means Example");
JavaSparkContext sc = new JavaSparkContext(conf);

// Load and parse data
String path = "data/mllib/kmeans_data.txt";
JavaRDD<String> data = sc.textFile(path);
JavaRDD<Vector> parsedData = data.map(
new Function<String, Vector>() {
public Vector call(String s) {
String[] sarray = s.split(" ");
double[] values = new double[sarray.length];
for (int i = 0; i < sarray.length; i++)
values[i] = Double.parseDouble(sarray[i]);
return Vectors.dense(values);
}
}
);

// Cluster the data into two classes using KMeans
int numClusters = 2;
int numIterations = 20;
KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

// Evaluate clustering by computing Within Set Sum of Squared Errors
double WSSSE = clusters.computeCost(parsedData.rdd());
System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
}
}
{% endhighlight %}

To run this standalone application with Spark, follow the instructions in the
[Standalone Applications](quick-start.html) section of the quick-start guide.
In addition, include *spark-mllib* as a dependency in your build file.
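For example, with an sbt build the added dependencies might look like the following (artifact versions are illustrative; match them to your Spark installation):

{% highlight scala %}
// build.sbt - versions shown are illustrative
name := "KMeansExample"

version := "1.0"

scalaVersion := "2.10.4"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "1.0.1",
  "org.apache.spark" %% "spark-mllib" % "1.0.1"
)
{% endhighlight %}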
</div>

<div data-lang="python" markdown="1">
Expand Down
80 changes: 79 additions & 1 deletion docs/mllib-collaborative-filtering.md
@@ -99,7 +99,85 @@ val model = ALS.trainImplicit(ratings, rank, numIterations, alpha)
All of MLlib's methods use Java-friendly types, so you can import and call them there the same
way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
calling `.rdd()` on your `JavaRDD` object.
calling `.rdd()` on your `JavaRDD` object. A standalone application example,
equivalent to the provided Scala example, is given below:

{% highlight java %}
import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import org.apache.spark.SparkConf;

public class CollaborativeFiltering {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("Collaborative Filtering Example");
JavaSparkContext sc = new JavaSparkContext(conf);

// Load and parse the data
String path = "data/mllib/als/test.data";
JavaRDD<String> data = sc.textFile(path);
JavaRDD<Rating> ratings = data.map(
new Function<String, Rating>() {
public Rating call(String s) {
String[] sarray = s.split(",");
return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]),
Double.parseDouble(sarray[2]));
}
}
);

// Build the recommendation model using ALS
int rank = 10;
int numIterations = 20;
MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);

// Evaluate the model on rating data
JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
new Function<Rating, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(Rating r) {
return new Tuple2<Object, Object>(r.user(), r.product());
}
}
);
JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = JavaPairRDD.fromJavaRDD(
model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
return new Tuple2<Tuple2<Integer, Integer>, Double>(
new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
}
}
));
JavaRDD<Tuple2<Double, Double>> ratesAndPreds =
JavaPairRDD.fromJavaRDD(ratings.map(
new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
return new Tuple2<Tuple2<Integer, Integer>, Double>(
new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
}
}
)).join(predictions).values();
double MSE = JavaDoubleRDD.fromRDD(ratesAndPreds.map(
new Function<Tuple2<Double, Double>, Object>() {
public Object call(Tuple2<Double, Double> pair) {
Double err = pair._1() - pair._2();
return err * err;
}
}
).rdd()).mean();
System.out.println("Mean Squared Error = " + MSE);
}
}
{% endhighlight %}

To run this standalone application with Spark, follow the instructions in the
[Standalone Applications](quick-start.html) section of the quick-start guide.
In addition, include *spark-mllib* as a dependency in your build file.
</div>

<div data-lang="python" markdown="1">
Expand Down