
Commit

Merge branch 'master' of git://git.apache.org/spark into SPARK-2583
sarutak committed Jul 22, 2014
2 parents 12d3de8 + 75db174 commit ffaa83d
Showing 197 changed files with 1,204 additions and 346 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ conf/spark-env.sh
conf/streaming-env.sh
conf/log4j.properties
conf/spark-defaults.conf
conf/hive-site.xml
docs/_site
docs/api
target/
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -134,8 +134,8 @@ class RangePartitioner[K : Ordering : ClassTag, V](
def getPartition(key: Any): Int = {
val k = key.asInstanceOf[K]
var partition = 0
if (rangeBounds.length < 1000) {
// If we have less than 100 partitions naive search
if (rangeBounds.length <= 128) {
// If we have less than 128 partitions naive search
while (partition < rangeBounds.length && ordering.gt(k, rangeBounds(partition))) {
partition += 1
}
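The hunk above lowers the cutoff below which `getPartition` scans the range bounds linearly; the `CollectionsUtils` change later in this commit suggests that larger bound arrays fall back to a binary search. A minimal, self-contained sketch of that two-path lookup, using illustrative names and a hand-rolled binary search rather than Spark's helper:

{% highlight scala %}
// Sketch only - illustrative names, not the Spark source. `bounds` holds the
// sorted upper bound of each partition; the returned index is the partition.
object PartitionLookupSketch {
  def getPartition[K](key: K, bounds: Array[K])(implicit ord: Ordering[K]): Int = {
    if (bounds.length <= 128) {
      // Few bounds: a simple linear scan is cheap.
      var p = 0
      while (p < bounds.length && ord.gt(key, bounds(p))) p += 1
      p
    } else {
      // Many bounds: binary search for the first bound that is >= key.
      var lo = 0
      var hi = bounds.length
      while (lo < hi) {
        val mid = (lo + hi) >>> 1
        if (ord.gt(key, bounds(mid))) lo = mid + 1 else hi = mid
      }
      lo
    }
  }
}
{% endhighlight %}

Both branches return the same index, and a key larger than every bound maps to the last partition (index `bounds.length`).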
@@ -99,7 +99,6 @@ class TaskMetrics extends Serializable {
existingMetrics.fetchWaitTime += newMetrics.fetchWaitTime
existingMetrics.localBlocksFetched += newMetrics.localBlocksFetched
existingMetrics.remoteBlocksFetched += newMetrics.remoteBlocksFetched
existingMetrics.totalBlocksFetched += newMetrics.totalBlocksFetched
existingMetrics.remoteBytesRead += newMetrics.remoteBytesRead
case None =>
_shuffleReadMetrics = Some(newMetrics)
@@ -149,7 +148,7 @@ class ShuffleReadMetrics extends Serializable {
/**
* Number of blocks fetched in this shuffle by this task (remote or local)
*/
var totalBlocksFetched: Int = _
def totalBlocksFetched: Int = remoteBlocksFetched + localBlocksFetched

/**
* Number of remote blocks fetched in this shuffle by this task
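A hedged sketch of the pattern these two hunks adopt (illustrative class, not the Spark source): the aggregate count is derived from its parts instead of being a separate mutable field, so it cannot drift out of sync when metrics from several fetches are merged.

{% highlight scala %}
// Illustrative sketch only.
class FetchMetricsSketch extends Serializable {
  var remoteBlocksFetched: Int = 0
  var localBlocksFetched: Int = 0

  // Derived on demand, so it is always consistent with the two counters.
  def totalBlocksFetched: Int = remoteBlocksFetched + localBlocksFetched

  def merge(other: FetchMetricsSketch): Unit = {
    remoteBlocksFetched += other.remoteBlocksFetched
    localBlocksFetched += other.localBlocksFetched
    // No separate total to update - the def above stays correct automatically.
  }
}
{% endhighlight %}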
52 changes: 48 additions & 4 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1269,11 +1269,55 @@ abstract class RDD[T: ClassTag](

/** A description of this RDD and its recursive dependencies for debugging. */
def toDebugString: String = {
def debugString(rdd: RDD[_], prefix: String = ""): Seq[String] = {
Seq(prefix + rdd + " (" + rdd.partitions.size + " partitions)") ++
rdd.dependencies.flatMap(d => debugString(d.rdd, prefix + " "))
// Apply a different rule to the last child
def debugChildren(rdd: RDD[_], prefix: String): Seq[String] = {
val len = rdd.dependencies.length
len match {
case 0 => Seq.empty
case 1 =>
val d = rdd.dependencies.head
debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_,_,_]], true)
case _ =>
val frontDeps = rdd.dependencies.take(len - 1)
val frontDepStrings = frontDeps.flatMap(
d => debugString(d.rdd, prefix, d.isInstanceOf[ShuffleDependency[_,_,_]]))

val lastDep = rdd.dependencies.last
val lastDepStrings =
debugString(lastDep.rdd, prefix, lastDep.isInstanceOf[ShuffleDependency[_,_,_]], true)

(frontDepStrings ++ lastDepStrings)
}
}
// The first RDD in the dependency stack has no parents, so no need for a +-
def firstDebugString(rdd: RDD[_]): Seq[String] = {
val partitionStr = "(" + rdd.partitions.size + ")"
val leftOffset = (partitionStr.length - 1) / 2
val nextPrefix = (" " * leftOffset) + "|" + (" " * (partitionStr.length - leftOffset))
Seq(partitionStr + " " + rdd) ++ debugChildren(rdd, nextPrefix)
}
def shuffleDebugString(rdd: RDD[_], prefix: String = "", isLastChild: Boolean): Seq[String] = {
val partitionStr = "(" + rdd.partitions.size + ")"
val leftOffset = (partitionStr.length - 1) / 2
val thisPrefix = prefix.replaceAll("\\|\\s+$", "")
val nextPrefix = (
thisPrefix
+ (if (isLastChild) " " else "| ")
+ (" " * leftOffset) + "|" + (" " * (partitionStr.length - leftOffset)))
Seq(thisPrefix + "+-" + partitionStr + " " + rdd) ++ debugChildren(rdd, nextPrefix)
}
def debugString(rdd: RDD[_],
prefix: String = "",
isShuffle: Boolean = true,
isLastChild: Boolean = false): Seq[String] = {
if (isShuffle) {
shuffleDebugString(rdd, prefix, isLastChild)
}
else {
Seq(prefix + rdd) ++ debugChildren(rdd, prefix)
}
}
debugString(this).mkString("\n")
firstDebugString(this).mkString("\n")
}

override def toString: String = "%s%s[%d] at %s".format(
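For reference, a hedged illustration of the tree-shaped output the reworked `toDebugString` produces. It assumes an existing `SparkContext` named `sc`; the RDD names, IDs, and exact spacing in the commented output are approximate, not taken from the commit.

{% highlight scala %}
val counts = sc.textFile("README.md")
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)

println(counts.toDebugString)
// Approximate output: "(n)" is a partition count and "+-" marks a shuffle
// boundary in the lineage.
// (2) ShuffledRDD[4] at reduceByKey at <console>:14
//  +-(2) MappedRDD[3] at map at <console>:14
//     |  FlatMappedRDD[2] at flatMap at <console>:14
//     |  MappedRDD[1] at textFile at <console>:12
//     |  HadoopRDD[0] at textFile at <console>:12
{% endhighlight %}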
@@ -81,7 +81,6 @@ private[hash] object BlockStoreShuffleFetcher extends Logging {
shuffleMetrics.shuffleFinishTime = System.currentTimeMillis
shuffleMetrics.fetchWaitTime = blockFetcherItr.fetchWaitTime
shuffleMetrics.remoteBytesRead = blockFetcherItr.remoteBytesRead
shuffleMetrics.totalBlocksFetched = blockFetcherItr.totalBlocks
shuffleMetrics.localBlocksFetched = blockFetcherItr.numLocalBlocks
shuffleMetrics.remoteBlocksFetched = blockFetcherItr.numRemoteBlocks
context.taskMetrics.updateShuffleReadMetrics(shuffleMetrics)
@@ -46,7 +46,6 @@ import org.apache.spark.util.Utils
private[storage]
trait BlockFetcherIterator extends Iterator[(BlockId, Option[Iterator[Any]])] with Logging {
def initialize()
def totalBlocks: Int
def numLocalBlocks: Int
def numRemoteBlocks: Int
def fetchWaitTime: Long
@@ -199,7 +198,7 @@ object BlockFetcherIterator {
}
}
logInfo("Getting " + _numBlocksToFetch + " non-empty blocks out of " +
totalBlocks + " blocks")
(numLocal + numRemote) + " blocks")
remoteRequests
}

@@ -242,7 +241,6 @@ object BlockFetcherIterator {
logDebug("Got local blocks in " + Utils.getUsedTimeMs(startTime) + " ms")
}

override def totalBlocks: Int = numLocal + numRemote
override def numLocalBlocks: Int = numLocal
override def numRemoteBlocks: Int = numRemote
override def fetchWaitTime: Long = _fetchWaitTime
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/ui/ToolTips.scala
@@ -20,9 +20,9 @@ package org.apache.spark.ui
private[spark] object ToolTips {
val SCHEDULER_DELAY =
"""Scheduler delay includes time to ship the task from the scheduler to
the executor, and time the time to send a message from the executor to the scheduler stating
that the task has completed. When the scheduler becomes overloaded, task completion messages
become queued up, and scheduler delay increases."""
the executor, and time to send the task result from the executor to the scheduler. If
scheduler delay is large, consider decreasing the size of tasks or decreasing the size
of task results."""

val INPUT = "Bytes read from Hadoop or from Spark storage."

@@ -19,11 +19,11 @@ package org.apache.spark.util

import java.util

import scala.Array
import scala.reflect._
import scala.reflect.{classTag, ClassTag}

private[spark] object CollectionsUtils {
def makeBinarySearch[K : Ordering : ClassTag] : (Array[K], K) => Int = {
// For primitive keys, we can use the natural ordering. Otherwise, use the Ordering comparator.
classTag[K] match {
case ClassTag.Float =>
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Float]], x.asInstanceOf[Float])
@@ -40,7 +40,8 @@ private[spark] object CollectionsUtils {
case ClassTag.Long =>
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[Long]], x.asInstanceOf[Long])
case _ =>
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[AnyRef]], x)
val comparator = implicitly[Ordering[K]].asInstanceOf[java.util.Comparator[Any]]
(l, x) => util.Arrays.binarySearch(l.asInstanceOf[Array[AnyRef]], x, comparator)
}
}
}
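The fix above matters for key types that have an `Ordering` but do not implement `Comparable`: without the comparator, `Arrays.binarySearch` on an `Array[AnyRef]` relies on the elements implementing `Comparable` and throws a `ClassCastException`. A hedged sketch using a hypothetical key type, not anything from the commit:

{% highlight scala %}
import java.util.Arrays

// Hypothetical key type: no Comparable, but an implicit Ordering.
case class Temperature(celsius: Int)

object BinarySearchSketch {
  implicit val tempOrdering: Ordering[Temperature] =
    Ordering.by((t: Temperature) => t.celsius)

  def main(args: Array[String]): Unit = {
    val bounds = Array(Temperature(0), Temperature(10), Temperature(20))
    // Same call shape as the fixed fallback branch above: pass the Ordering
    // as a java.util.Comparator instead of relying on Comparable elements.
    val comparator = implicitly[Ordering[Temperature]].asInstanceOf[java.util.Comparator[Any]]
    val idx = Arrays.binarySearch(bounds.asInstanceOf[Array[AnyRef]], Temperature(10), comparator)
    println(idx)  // 1
  }
}
{% endhighlight %}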
2 changes: 0 additions & 2 deletions core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -237,7 +237,6 @@ private[spark] object JsonProtocol {

def shuffleReadMetricsToJson(shuffleReadMetrics: ShuffleReadMetrics): JValue = {
("Shuffle Finish Time" -> shuffleReadMetrics.shuffleFinishTime) ~
("Total Blocks Fetched" -> shuffleReadMetrics.totalBlocksFetched) ~
("Remote Blocks Fetched" -> shuffleReadMetrics.remoteBlocksFetched) ~
("Local Blocks Fetched" -> shuffleReadMetrics.localBlocksFetched) ~
("Fetch Wait Time" -> shuffleReadMetrics.fetchWaitTime) ~
@@ -548,7 +547,6 @@ private[spark] object JsonProtocol {
def shuffleReadMetricsFromJson(json: JValue): ShuffleReadMetrics = {
val metrics = new ShuffleReadMetrics
metrics.shuffleFinishTime = (json \ "Shuffle Finish Time").extract[Long]
metrics.totalBlocksFetched = (json \ "Total Blocks Fetched").extract[Int]
metrics.remoteBlocksFetched = (json \ "Remote Blocks Fetched").extract[Int]
metrics.localBlocksFetched = (json \ "Local Blocks Fetched").extract[Int]
metrics.fetchWaitTime = (json \ "Fetch Wait Time").extract[Long]
14 changes: 14 additions & 0 deletions core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -91,6 +91,17 @@ class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMet
}
}

test("RangePartitioner for keys that are not Comparable (but with Ordering)") {
// Row does not extend Comparable, but has an implicit Ordering defined.
implicit object RowOrdering extends Ordering[Row] {
override def compare(x: Row, y: Row) = x.value - y.value
}

val rdd = sc.parallelize(1 to 4500).map(x => (Row(x), Row(x)))
val partitioner = new RangePartitioner(1500, rdd)
partitioner.getPartition(Row(100))
}

test("HashPartitioner not equal to RangePartitioner") {
val rdd = sc.parallelize(1 to 10).map(x => (x, x))
val rangeP2 = new RangePartitioner(2, rdd)
@@ -177,3 +188,6 @@ class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMet
// Add other tests here for classes that should be able to handle empty partitions correctly
}
}


private sealed case class Row(value: Int)
@@ -314,7 +314,6 @@ class JsonProtocolSuite extends FunSuite {

private def assertEquals(metrics1: ShuffleReadMetrics, metrics2: ShuffleReadMetrics) {
assert(metrics1.shuffleFinishTime === metrics2.shuffleFinishTime)
assert(metrics1.totalBlocksFetched === metrics2.totalBlocksFetched)
assert(metrics1.remoteBlocksFetched === metrics2.remoteBlocksFetched)
assert(metrics1.localBlocksFetched === metrics2.localBlocksFetched)
assert(metrics1.fetchWaitTime === metrics2.fetchWaitTime)
@@ -513,7 +512,6 @@ class JsonProtocolSuite extends FunSuite {
} else {
val sr = new ShuffleReadMetrics
sr.shuffleFinishTime = b + c
sr.totalBlocksFetched = e + f
sr.remoteBytesRead = b + d
sr.localBlocksFetched = e
sr.fetchWaitTime = a + d
@@ -584,7 +582,6 @@ class JsonProtocolSuite extends FunSuite {
| "Memory Bytes Spilled":800,"Disk Bytes Spilled":0,
| "Shuffle Read Metrics":{
| "Shuffle Finish Time":900,
| "Total Blocks Fetched":1500,
| "Remote Blocks Fetched":800,
| "Local Blocks Fetched":700,
| "Fetch Wait Time":900,
49 changes: 48 additions & 1 deletion docs/mllib-clustering.md
@@ -69,7 +69,54 @@ println("Within Set Sum of Squared Errors = " + WSSSE)
All of MLlib's methods use Java-friendly types, so you can import and call them there the same
way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
calling `.rdd()` on your `JavaRDD` object.
calling `.rdd()` on your `JavaRDD` object. A standalone application example,
equivalent to the provided Scala example, is given below:

{% highlight java %}
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.SparkConf;

public class KMeansExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("K-means Example");
JavaSparkContext sc = new JavaSparkContext(conf);

// Load and parse data
String path = "data/mllib/kmeans_data.txt";
JavaRDD<String> data = sc.textFile(path);
JavaRDD<Vector> parsedData = data.map(
new Function<String, Vector>() {
public Vector call(String s) {
String[] sarray = s.split(" ");
double[] values = new double[sarray.length];
for (int i = 0; i < sarray.length; i++)
values[i] = Double.parseDouble(sarray[i]);
return Vectors.dense(values);
}
}
);

// Cluster the data into two classes using KMeans
int numClusters = 2;
int numIterations = 20;
KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

// Evaluate clustering by computing Within Set Sum of Squared Errors
double WSSSE = clusters.computeCost(parsedData.rdd());
System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
}
}
{% endhighlight %}

To run this standalone application with Spark, follow the instructions in the
[Standalone Applications](quick-start.html) section of the quick-start guide.
In addition, include *spark-mllib* as a dependency in your build file.
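For example, with an sbt build the added dependencies might look like the following (artifact versions are illustrative; match them to your Spark installation):

{% highlight scala %}
// build.sbt - versions shown are illustrative
name := "KMeansExample"

version := "1.0"

scalaVersion := "2.10.4"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "1.0.1",
  "org.apache.spark" %% "spark-mllib" % "1.0.1"
)
{% endhighlight %}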
</div>

<div data-lang="python" markdown="1">
Expand Down
80 changes: 79 additions & 1 deletion docs/mllib-collaborative-filtering.md
@@ -99,7 +99,85 @@ val model = ALS.trainImplicit(ratings, rank, numIterations, alpha)
All of MLlib's methods use Java-friendly types, so you can import and call them there the same
way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
calling `.rdd()` on your `JavaRDD` object.
calling `.rdd()` on your `JavaRDD` object. A standalone application example,
equivalent to the provided Scala example, is given below:

{% highlight java %}
import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import org.apache.spark.SparkConf;

public class CollaborativeFiltering {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("Collaborative Filtering Example");
JavaSparkContext sc = new JavaSparkContext(conf);

// Load and parse the data
String path = "data/mllib/als/test.data";
JavaRDD<String> data = sc.textFile(path);
JavaRDD<Rating> ratings = data.map(
new Function<String, Rating>() {
public Rating call(String s) {
String[] sarray = s.split(",");
return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]),
Double.parseDouble(sarray[2]));
}
}
);

// Build the recommendation model using ALS
int rank = 10;
int numIterations = 20;
MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);

// Evaluate the model on rating data
JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
new Function<Rating, Tuple2<Object, Object>>() {
public Tuple2<Object, Object> call(Rating r) {
return new Tuple2<Object, Object>(r.user(), r.product());
}
}
);
JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = JavaPairRDD.fromJavaRDD(
model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
return new Tuple2<Tuple2<Integer, Integer>, Double>(
new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
}
}
));
JavaRDD<Tuple2<Double, Double>> ratesAndPreds =
JavaPairRDD.fromJavaRDD(ratings.map(
new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
return new Tuple2<Tuple2<Integer, Integer>, Double>(
new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
}
}
)).join(predictions).values();
double MSE = JavaDoubleRDD.fromRDD(ratesAndPreds.map(
new Function<Tuple2<Double, Double>, Object>() {
public Object call(Tuple2<Double, Double> pair) {
Double err = pair._1() - pair._2();
return err * err;
}
}
).rdd()).mean();
System.out.println("Mean Squared Error = " + MSE);
}
}
{% endhighlight %}

To run this standalone application with Spark, follow the instructions in the
[Standalone Applications](quick-start.html) section of the quick-start guide.
In addition, include *spark-mllib* as a dependency in your build file.
</div>

<div data-lang="python" markdown="1">
Expand Down