From e31c8ffca65e0e5cd5f1a6229f3d654a24b7b18c Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 18 Apr 2014 10:01:16 -0700 Subject: [PATCH 1/9] SPARK-1483: Rename minSplits to minPartitions in public APIs https://issues.apache.org/jira/browse/SPARK-1483 From the original JIRA: " The parameter name is part of the public API in Scala and Python, since you can pass named parameters to a method, so we should name it to this more descriptive term. Everywhere else we refer to "splits" as partitions." - @mateiz Author: CodingCat Closes #430 from CodingCat/SPARK-1483 and squashes the following commits: 4b60541 [CodingCat] deprecate defaultMinSplits ba2c663 [CodingCat] Rename minSplits to minPartitions in public APIs --- .../scala/org/apache/spark/SparkContext.scala | 47 ++++++++++--------- .../spark/api/java/JavaSparkContext.scala | 37 +++++++++------ .../input/WholeTextFileInputFormat.scala | 7 +-- .../org/apache/spark/rdd/HadoopRDD.scala | 10 ++-- .../org/apache/spark/rdd/NewHadoopRDD.scala | 4 +- .../org/apache/spark/mllib/util/MLUtils.scala | 12 ++--- python/pyspark/context.py | 6 +-- .../apache/spark/sql/hive/TableReader.scala | 2 +- 8 files changed, 70 insertions(+), 55 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 3ddc0d5eeefb8..ee5637371fdca 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -427,9 +427,9 @@ class SparkContext(config: SparkConf) extends Logging { * Read a text file from HDFS, a local file system (available on all nodes), or any * Hadoop-supported file system URI, and return it as an RDD of Strings. */ - def textFile(path: String, minSplits: Int = defaultMinSplits): RDD[String] = { + def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = { hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], - minSplits).map(pair => pair._2.toString) + minPartitions).map(pair => pair._2.toString) } /** @@ -457,9 +457,10 @@ class SparkContext(config: SparkConf) extends Logging { * * @note Small files are preferred, large file is also allowable, but may cause bad performance. * - * @param minSplits A suggestion value of the minimal splitting number for input data. + * @param minPartitions A suggestion value of the minimal splitting number for input data. */ - def wholeTextFiles(path: String, minSplits: Int = defaultMinSplits): RDD[(String, String)] = { + def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions): + RDD[(String, String)] = { val job = new NewHadoopJob(hadoopConfiguration) NewFileInputFormat.addInputPath(job, new Path(path)) val updateConf = job.getConfiguration @@ -469,7 +470,7 @@ class SparkContext(config: SparkConf) extends Logging { classOf[String], classOf[String], updateConf, - minSplits) + minPartitions) } /** @@ -481,7 +482,7 @@ class SparkContext(config: SparkConf) extends Logging { * @param inputFormatClass Class of the InputFormat * @param keyClass Class of the keys * @param valueClass Class of the values - * @param minSplits Minimum number of Hadoop Splits to generate. + * @param minPartitions Minimum number of Hadoop Splits to generate. * * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. 
@@ -493,11 +494,11 @@ class SparkContext(config: SparkConf) extends Logging { inputFormatClass: Class[_ <: InputFormat[K, V]], keyClass: Class[K], valueClass: Class[V], - minSplits: Int = defaultMinSplits + minPartitions: Int = defaultMinPartitions ): RDD[(K, V)] = { // Add necessary security credentials to the JobConf before broadcasting it. SparkHadoopUtil.get.addCredentials(conf) - new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minSplits) + new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minPartitions) } /** Get an RDD for a Hadoop file with an arbitrary InputFormat @@ -512,7 +513,7 @@ class SparkContext(config: SparkConf) extends Logging { inputFormatClass: Class[_ <: InputFormat[K, V]], keyClass: Class[K], valueClass: Class[V], - minSplits: Int = defaultMinSplits + minPartitions: Int = defaultMinPartitions ): RDD[(K, V)] = { // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it. val confBroadcast = broadcast(new SerializableWritable(hadoopConfiguration)) @@ -524,7 +525,7 @@ class SparkContext(config: SparkConf) extends Logging { inputFormatClass, keyClass, valueClass, - minSplits) + minPartitions) } /** @@ -532,7 +533,7 @@ class SparkContext(config: SparkConf) extends Logging { * values and the InputFormat so that users don't need to pass them directly. Instead, callers * can just write, for example, * {{{ - * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path, minSplits) + * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path, minPartitions) * }}} * * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each @@ -541,13 +542,13 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def hadoopFile[K, V, F <: InputFormat[K, V]] - (path: String, minSplits: Int) + (path: String, minPartitions: Int) (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { hadoopFile(path, fm.runtimeClass.asInstanceOf[Class[F]], km.runtimeClass.asInstanceOf[Class[K]], vm.runtimeClass.asInstanceOf[Class[V]], - minSplits) + minPartitions) } /** @@ -565,7 +566,7 @@ class SparkContext(config: SparkConf) extends Logging { */ def hadoopFile[K, V, F <: InputFormat[K, V]](path: String) (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = - hadoopFile[K, V, F](path, defaultMinSplits) + hadoopFile[K, V, F](path, defaultMinPartitions) /** Get an RDD for a Hadoop file with an arbitrary new API InputFormat. */ def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]] @@ -626,10 +627,10 @@ class SparkContext(config: SparkConf) extends Logging { def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V], - minSplits: Int + minPartitions: Int ): RDD[(K, V)] = { val inputFormatClass = classOf[SequenceFileInputFormat[K, V]] - hadoopFile(path, inputFormatClass, keyClass, valueClass, minSplits) + hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions) } /** Get an RDD for a Hadoop SequenceFile with given key and value types. 
@@ -641,7 +642,7 @@ class SparkContext(config: SparkConf) extends Logging { * */ def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V] ): RDD[(K, V)] = - sequenceFile(path, keyClass, valueClass, defaultMinSplits) + sequenceFile(path, keyClass, valueClass, defaultMinPartitions) /** * Version of sequenceFile() for types implicitly convertible to Writables through a @@ -665,7 +666,7 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def sequenceFile[K, V] - (path: String, minSplits: Int = defaultMinSplits) + (path: String, minPartitions: Int = defaultMinPartitions) (implicit km: ClassTag[K], vm: ClassTag[V], kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) : RDD[(K, V)] = { @@ -674,7 +675,7 @@ class SparkContext(config: SparkConf) extends Logging { val format = classOf[SequenceFileInputFormat[Writable, Writable]] val writables = hadoopFile(path, format, kc.writableClass(km).asInstanceOf[Class[Writable]], - vc.writableClass(vm).asInstanceOf[Class[Writable]], minSplits) + vc.writableClass(vm).asInstanceOf[Class[Writable]], minPartitions) writables.map { case (k, v) => (kc.convert(k), vc.convert(v)) } } @@ -688,9 +689,9 @@ class SparkContext(config: SparkConf) extends Logging { */ def objectFile[T: ClassTag]( path: String, - minSplits: Int = defaultMinSplits + minPartitions: Int = defaultMinPartitions ): RDD[T] = { - sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minSplits) + sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions) .flatMap(x => Utils.deserialize[Array[T]](x._2.getBytes)) } @@ -1183,8 +1184,12 @@ class SparkContext(config: SparkConf) extends Logging { def defaultParallelism: Int = taskScheduler.defaultParallelism /** Default min number of partitions for Hadoop RDDs when not given by user */ + @deprecated("use defaultMinPartitions", "1.0.0") def defaultMinSplits: Int = math.min(defaultParallelism, 2) + /** Default min number of partitions for Hadoop RDDs when not given by user */ + def defaultMinPartitions: Int = math.min(defaultParallelism, 2) + private val nextShuffleId = new AtomicInteger(0) private[spark] def newShuffleId(): Int = nextShuffleId.getAndIncrement() diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index e6a3f06b0ea42..cf30523ab523e 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -109,9 +109,17 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork /** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */ def defaultParallelism: java.lang.Integer = sc.defaultParallelism - /** Default min number of partitions for Hadoop RDDs when not given by user */ + /** + * Default min number of partitions for Hadoop RDDs when not given by user. + * @deprecated As of Spark 1.0.0, defaultMinSplits is deprecated, use + * {@link #defaultMinPartitions()} instead + */ + @Deprecated def defaultMinSplits: java.lang.Integer = sc.defaultMinSplits + /** Default min number of partitions for Hadoop RDDs when not given by user */ + def defaultMinPartitions: java.lang.Integer = sc.defaultMinPartitions + /** Distribute a local Scala collection to form an RDD. 
*/ def parallelize[T](list: java.util.List[T], numSlices: Int): JavaRDD[T] = { implicit val ctag: ClassTag[T] = fakeClassTag @@ -153,7 +161,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork * Read a text file from HDFS, a local file system (available on all nodes), or any * Hadoop-supported file system URI, and return it as an RDD of Strings. */ - def textFile(path: String, minSplits: Int): JavaRDD[String] = sc.textFile(path, minSplits) + def textFile(path: String, minPartitions: Int): JavaRDD[String] = + sc.textFile(path, minPartitions) /** * Read a directory of text files from HDFS, a local file system (available on all nodes), or any @@ -180,17 +189,17 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork * * @note Small files are preferred, large file is also allowable, but may cause bad performance. * - * @param minSplits A suggestion value of the minimal splitting number for input data. + * @param minPartitions A suggestion value of the minimal splitting number for input data. */ - def wholeTextFiles(path: String, minSplits: Int): JavaPairRDD[String, String] = - new JavaPairRDD(sc.wholeTextFiles(path, minSplits)) + def wholeTextFiles(path: String, minPartitions: Int): JavaPairRDD[String, String] = + new JavaPairRDD(sc.wholeTextFiles(path, minPartitions)) /** * Read a directory of text files from HDFS, a local file system (available on all nodes), or any * Hadoop-supported file system URI. Each file is read as a single record and returned in a * key-value pair, where the key is the path of each file, the value is the content of each file. * - * @see `wholeTextFiles(path: String, minSplits: Int)`. + * @see `wholeTextFiles(path: String, minPartitions: Int)`. */ def wholeTextFiles(path: String): JavaPairRDD[String, String] = new JavaPairRDD(sc.wholeTextFiles(path)) @@ -205,11 +214,11 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V], - minSplits: Int + minPartitions: Int ): JavaPairRDD[K, V] = { implicit val ctagK: ClassTag[K] = ClassTag(keyClass) implicit val ctagV: ClassTag[V] = ClassTag(valueClass) - new JavaPairRDD(sc.sequenceFile(path, keyClass, valueClass, minSplits)) + new JavaPairRDD(sc.sequenceFile(path, keyClass, valueClass, minPartitions)) } /** Get an RDD for a Hadoop SequenceFile. @@ -233,9 +242,9 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork * slow if you use the default serializer (Java serialization), though the nice thing about it is * that there's very little effort required to save arbitrary objects. 
*/ - def objectFile[T](path: String, minSplits: Int): JavaRDD[T] = { + def objectFile[T](path: String, minPartitions: Int): JavaRDD[T] = { implicit val ctag: ClassTag[T] = fakeClassTag - sc.objectFile(path, minSplits)(ctag) + sc.objectFile(path, minPartitions)(ctag) } /** @@ -265,11 +274,11 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork inputFormatClass: Class[F], keyClass: Class[K], valueClass: Class[V], - minSplits: Int + minPartitions: Int ): JavaPairRDD[K, V] = { implicit val ctagK: ClassTag[K] = ClassTag(keyClass) implicit val ctagV: ClassTag[V] = ClassTag(valueClass) - new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass, minSplits)) + new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass, minPartitions)) } /** @@ -304,11 +313,11 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork inputFormatClass: Class[F], keyClass: Class[K], valueClass: Class[V], - minSplits: Int + minPartitions: Int ): JavaPairRDD[K, V] = { implicit val ctagK: ClassTag[K] = ClassTag(keyClass) implicit val ctagV: ClassTag[V] = ClassTag(valueClass) - new JavaPairRDD(sc.hadoopFile(path, inputFormatClass, keyClass, valueClass, minSplits)) + new JavaPairRDD(sc.hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions)) } /** Get an RDD for a Hadoop file with an arbitrary InputFormat diff --git a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala index 80d055a89573b..4cb450577796a 100644 --- a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala +++ b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala @@ -48,14 +48,15 @@ private[spark] class WholeTextFileInputFormat extends CombineFileInputFormat[Str } /** - * Allow minSplits set by end-user in order to keep compatibility with old Hadoop API. + * Allow minPartitions set by end-user in order to keep compatibility with old Hadoop API. */ - def setMaxSplitSize(context: JobContext, minSplits: Int) { + def setMaxSplitSize(context: JobContext, minPartitions: Int) { val files = listStatus(context) val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum - val maxSplitSize = Math.ceil(totalLen * 1.0 / (if (minSplits == 0) 1 else minSplits)).toLong + val maxSplitSize = Math.ceil(totalLen * 1.0 / + (if (minPartitions == 0) 1 else minPartitions)).toLong super.setMaxSplitSize(maxSplitSize) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 6811e1abb8b70..6547755764dcf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -87,7 +87,7 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp * @param inputFormatClass Storage format of the data to be read. * @param keyClass Class of the key associated with the inputFormatClass. * @param valueClass Class of the value associated with the inputFormatClass. - * @param minSplits Minimum number of Hadoop Splits (HadoopRDD partitions) to generate. + * @param minPartitions Minimum number of HadoopRDD partitions (Hadoop Splits) to generate. 
*/ @DeveloperApi class HadoopRDD[K, V]( @@ -97,7 +97,7 @@ class HadoopRDD[K, V]( inputFormatClass: Class[_ <: InputFormat[K, V]], keyClass: Class[K], valueClass: Class[V], - minSplits: Int) + minPartitions: Int) extends RDD[(K, V)](sc, Nil) with Logging { def this( @@ -106,7 +106,7 @@ class HadoopRDD[K, V]( inputFormatClass: Class[_ <: InputFormat[K, V]], keyClass: Class[K], valueClass: Class[V], - minSplits: Int) = { + minPartitions: Int) = { this( sc, sc.broadcast(new SerializableWritable(conf)) @@ -115,7 +115,7 @@ class HadoopRDD[K, V]( inputFormatClass, keyClass, valueClass, - minSplits) + minPartitions) } protected val jobConfCacheKey = "rdd_%d_job_conf".format(id) @@ -169,7 +169,7 @@ class HadoopRDD[K, V]( if (inputFormat.isInstanceOf[Configurable]) { inputFormat.asInstanceOf[Configurable].setConf(jobConf) } - val inputSplits = inputFormat.getSplits(jobConf, minSplits) + val inputSplits = inputFormat.getSplits(jobConf, minPartitions) val array = new Array[Partition](inputSplits.size) for (i <- 0 until inputSplits.size) { array(i) = new HadoopPartition(id, i, inputSplits(i)) diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 8684b645bc361..ac1ccc06f238a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -158,7 +158,7 @@ private[spark] class WholeTextFileRDD( keyClass: Class[String], valueClass: Class[String], @transient conf: Configuration, - minSplits: Int) + minPartitions: Int) extends NewHadoopRDD[String, String](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { @@ -169,7 +169,7 @@ private[spark] class WholeTextFileRDD( case _ => } val jobContext = newJobContext(conf, jobId) - inputFormat.setMaxSplitSize(jobContext, minSplits) + inputFormat.setMaxSplitSize(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 2f3ac10397515..3d6e7e0d5c953 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -57,7 +57,7 @@ object MLUtils { * @param labelParser parser for labels, default: 1.0 if label > 0.5 or 0.0 otherwise * @param numFeatures number of features, which will be determined from the input data if a * negative value is given. The default value is -1. - * @param minSplits min number of partitions, default: sc.defaultMinSplits + * @param minPartitions min number of partitions, default: sc.defaultMinPartitions * @return labeled data stored as an RDD[LabeledPoint] */ def loadLibSVMData( @@ -65,8 +65,8 @@ object MLUtils { path: String, labelParser: LabelParser, numFeatures: Int, - minSplits: Int): RDD[LabeledPoint] = { - val parsed = sc.textFile(path, minSplits) + minPartitions: Int): RDD[LabeledPoint] = { + val parsed = sc.textFile(path, minPartitions) .map(_.trim) .filter(!_.isEmpty) .map(_.split(' ')) @@ -101,7 +101,7 @@ object MLUtils { * with number of features determined automatically and the default number of partitions. 
*/ def loadLibSVMData(sc: SparkContext, path: String): RDD[LabeledPoint] = - loadLibSVMData(sc, path, BinaryLabelParser, -1, sc.defaultMinSplits) + loadLibSVMData(sc, path, BinaryLabelParser, -1, sc.defaultMinPartitions) /** * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], @@ -112,7 +112,7 @@ object MLUtils { sc: SparkContext, path: String, labelParser: LabelParser): RDD[LabeledPoint] = - loadLibSVMData(sc, path, labelParser, -1, sc.defaultMinSplits) + loadLibSVMData(sc, path, labelParser, -1, sc.defaultMinPartitions) /** * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], @@ -124,7 +124,7 @@ object MLUtils { path: String, labelParser: LabelParser, numFeatures: Int): RDD[LabeledPoint] = - loadLibSVMData(sc, path, labelParser, numFeatures, sc.defaultMinSplits) + loadLibSVMData(sc, path, labelParser, numFeatures, sc.defaultMinPartitions) /** * :: Experimental :: diff --git a/python/pyspark/context.py b/python/pyspark/context.py index d8667e84fedff..f63cc4a55fb98 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -248,14 +248,14 @@ def parallelize(self, c, numSlices=None): jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices) return RDD(jrdd, self, serializer) - def textFile(self, name, minSplits=None): + def textFile(self, name, minPartitions=None): """ Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings. """ - minSplits = minSplits or min(self.defaultParallelism, 2) - return RDD(self._jsc.textFile(name, minSplits), self, + minPartitions = minPartitions or min(self.defaultParallelism, 2) + return RDD(self._jsc.textFile(name, minPartitions), self, UTF8Deserializer()) def wholeTextFiles(self, path): diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index 0da5eb754cb3f..8cfde46186ca4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -52,7 +52,7 @@ class HadoopTableReader(@transient _tableDesc: TableDesc, @transient sc: HiveCon // Choose the minimum number of splits. If mapred.map.tasks is set, then use that unless // it is smaller than what Spark suggests. private val _minSplitsPerRDD = math.max( - sc.hiveconf.getInt("mapred.map.tasks", 1), sc.sparkContext.defaultMinSplits) + sc.hiveconf.getInt("mapred.map.tasks", 1), sc.sparkContext.defaultMinPartitions) // TODO: set aws s3 credentials. 
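The rename also changes how call sites read: because Scala and Python support named arguments, the parameter name is effectively part of the public API, and minPartitions now describes a partition hint rather than a Hadoop "split" count. A minimal call-site sketch, assuming a Spark 1.0-era local context (the input path and the counts below are illustrative, not part of the patch):

    import org.apache.spark.{SparkConf, SparkContext}

    object MinPartitionsExample {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("min-partitions-example").setMaster("local[2]")
        val sc = new SparkContext(conf)

        // Passing the argument by name is exactly what the rename is about:
        // the hint now reads as a number of partitions, not Hadoop splits.
        val lines = sc.textFile("data/input.txt", minPartitions = 4)

        // defaultMinSplits still compiles but is @deprecated as of 1.0.0;
        // defaultMinPartitions is the drop-in replacement.
        println(s"default hint = ${sc.defaultMinPartitions}, partitions = ${lines.partitions.length}")

        sc.stop()
      }
    }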
From 89f47434e2a6c2f8b80c44d08f866d3a8b8e85c3 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 18 Apr 2014 10:02:27 -0700 Subject: [PATCH 2/9] Reuses Row object in ExistingRdd.productToRowRdd() Author: Cheng Lian Closes #432 from liancheng/reuseRow and squashes the following commits: 9e6d083 [Cheng Lian] Simplified code with BufferedIterator 52acec9 [Cheng Lian] Reuses Row object in ExistingRdd.productToRowRdd() --- .../spark/sql/execution/basicOperators.scala | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index ab2e62463764a..eedcc7dda02d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{OrderedDistribution, UnspecifiedDistribution} import org.apache.spark.util.MutablePair - case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode { override def output = projectList.map(_.toAttribute) @@ -143,8 +142,24 @@ object ExistingRdd { } def productToRowRdd[A <: Product](data: RDD[A]): RDD[Row] = { - // TODO: Reuse the row, don't use map on the product iterator. Maybe code gen? - data.map(r => new GenericRow(r.productIterator.map(convertToCatalyst).toArray): Row) + data.mapPartitions { iterator => + if (iterator.isEmpty) { + Iterator.empty + } else { + val bufferedIterator = iterator.buffered + val mutableRow = new GenericMutableRow(bufferedIterator.head.productArity) + + bufferedIterator.map { r => + var i = 0 + while (i < mutableRow.length) { + mutableRow(i) = r.productElement(i) + i += 1 + } + + mutableRow + } + } + } } def fromProductRdd[A <: Product : TypeTag](productRdd: RDD[A]) = { From aa17f022c59af02b04b977da9017671ef14d664a Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 18 Apr 2014 10:03:15 -0700 Subject: [PATCH 3/9] [SPARK-1520] remove fastutil from dependencies A quick fix for https://issues.apache.org/jira/browse/SPARK-1520 By excluding fastutil, we bring the number of files in the assembly jar back under 65536, so Java 7 won't create the assembly jar in zip64 format, which cannot be read by Java 6. With this change, the assembly jar now has about 60000 entries (58000 files), tested with both sbt and maven. Author: Xiangrui Meng Closes #437 from mengxr/remove-fastutil and squashes the following commits: 00f9beb [Xiangrui Meng] remove fastutil from dependencies --- pom.xml | 7 +++++++ project/SparkBuild.scala | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index cd204376de5db..4ff18afa227be 100644 --- a/pom.xml +++ b/pom.xml @@ -263,6 +263,13 @@ com.clearspring.analytics stream 2.5.1 + + + + it.unimi.dsi + fastutil + +
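PATCH 3/9 shows only the Maven side of the fastutil exclusion; the SparkBuild.scala half of the change is truncated above. Purely as a generic illustration of how a transitive dependency is excluded in sbt (an assumption for illustration, not the actual SparkBuild.scala diff), an equivalent build.sbt line would look roughly like:

    // Illustrative only: excludes the transitive fastutil dependency pulled in by
    // clearspring's stream-lib, mirroring the Maven <exclusion> element shown above.
    libraryDependencies += "com.clearspring.analytics" % "stream" % "2.5.1" exclude("it.unimi.dsi", "fastutil")

Dropping fastutil keeps the assembly jar under the 65536-entry limit, so Java 7 writes a plain zip that Java 6 can still read.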
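Stepping back to PATCH 2/9: the productToRowRdd() rewrite avoids allocating a fresh row per record by having mapPartitions size one mutable row from the first element (peeked via BufferedIterator.head) and then overwrite it for every record. A framework-free sketch of the same reuse pattern, using a plain Array[Any] instead of Spark SQL's row classes (the names here are illustrative assumptions, not Spark API):

    // Reuse one mutable buffer across an iterator of Product records.
    // BufferedIterator.head peeks at the first element to size the buffer
    // without consuming it; each next() then overwrites the same array.
    def productsToReusedRows[A <: Product](records: Iterator[A]): Iterator[Array[Any]] = {
      if (records.isEmpty) {
        Iterator.empty
      } else {
        val buffered = records.buffered
        val row = new Array[Any](buffered.head.productArity)
        buffered.map { record =>
          var i = 0
          while (i < row.length) {
            row(i) = record.productElement(i)
            i += 1
          }
          row // callers must copy if they hold the value past the next() call
        }
      }
    }

The same caveat as Hadoop's reused Writable objects applies: caching or collecting the returned values without copying them would leave many references to a single object.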